diff --git a/benchmarks/qwen3-tts/README.md b/benchmarks/qwen3-tts/README.md new file mode 100644 index 0000000000..b038669e86 --- /dev/null +++ b/benchmarks/qwen3-tts/README.md @@ -0,0 +1,100 @@ +# Qwen3-TTS Benchmark + +Benchmarks for Qwen3-TTS text-to-speech models, comparing vLLM-Omni streaming serving against HuggingFace Transformers offline inference. + +## Prerequisites + +```bash +pip install matplotlib aiohttp soundfile numpy tqdm +pip install qwen_tts # for HF baseline +``` + +## Quick Start + +Run the full benchmark (vllm-omni + HF baseline) with a single command: + +```bash +cd benchmarks/qwen3-tts +bash run_benchmark.sh +``` + +Results (JSON + PNG plots) are saved to `results/`. + +### Common options + +```bash +# Only vllm-omni (skip HF baseline) +bash run_benchmark.sh --async-only + +# Only HF baseline +bash run_benchmark.sh --hf-only + +# Use a different model (e.g. 1.7B) +MODEL=Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice bash run_benchmark.sh --async-only + +# Use batch_size=4 config for higher throughput +STAGE_CONFIG=vllm_omni/configs/qwen3_tts_bs4.yaml bash run_benchmark.sh --async-only + +# Custom GPU, prompt count, concurrency levels +GPU_DEVICE=1 NUM_PROMPTS=20 CONCURRENCY="1 4" bash run_benchmark.sh +``` + +## Manual Steps + +### 1) Start the vLLM-Omni server + +```bash +CUDA_VISIBLE_DEVICES=0 python -m vllm_omni.entrypoints.cli.main serve \ + "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice" \ + --omni --host 127.0.0.1 --port 8000 \ + --stage-configs-path benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs1.yaml \ + --trust-remote-code +``` + +### 2) Run online serving benchmark + +```bash +python vllm_omni/bench_tts_serve.py \ + --port 8000 \ + --num-prompts 50 \ + --max-concurrency 1 4 10 \ + --config-name "async_chunk" \ + --result-dir results/ +``` + +### 3) Run HuggingFace baseline + +```bash +python transformers/bench_tts_hf.py \ + --model "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice" \ + --num-prompts 50 \ + --gpu-device 0 \ + --result-dir results/ +``` + 

### 4) Generate comparison plots

```bash
python plot_results.py \
    --results results/bench_async_chunk_*.json results/bench_hf_transformers_*.json \
    --labels "vllm-omni" "hf_transformers" \
    --output results/comparison.png
```

## Stage Configs

| Config | Batch Size | Description |
|--------|:----------:|-------------|
| `vllm_omni/configs/qwen3_tts_bs1.yaml` | 1 | Single-request processing (lower latency) |
| `vllm_omni/configs/qwen3_tts_bs4.yaml` | 4 | Concurrent request processing (higher throughput) |

Both configs use a 2-stage pipeline (Talker -> Code2Wav) with `async_chunk` streaming enabled. The `SharedMemoryConnector` streams codec frames (25-frame chunks with 25-frame context overlap) between stages.

The stage configs do not pin a model: the model is passed on the command line (the positional argument of `serve`, or the `MODEL` env var in `run_benchmark.sh`), so the same configs work for both the 0.6B and 1.7B model variants.

## Metrics

- **TTFP (Time to First Audio Packet)**: Time from request to first audio chunk (streaming latency)
- **E2E (End-to-End Latency)**: Total time from request to complete audio response
- **RTF (Real-Time Factor)**: E2E latency / audio duration. RTF < 1.0 means faster-than-real-time synthesis
- **Throughput**: Total audio seconds generated per wall-clock second
diff --git a/benchmarks/qwen3-tts/plot_results.py b/benchmarks/qwen3-tts/plot_results.py
new file mode 100644
index 0000000000..e750101e32
--- /dev/null
+++ b/benchmarks/qwen3-tts/plot_results.py
@@ -0,0 +1,254 @@
"""Plot Qwen3-TTS benchmark results.
+ +Generates comparison bar charts similar to the async_chunk design doc: +- TTFP (Time-to-First-Packet) across concurrency levels +- E2E latency across concurrency levels +- RTF (Real-Time Factor) across concurrency levels + +Usage: + # Compare two configs (async_chunk vs no_async_chunk): + python plot_results.py \ + --results results/bench_async_chunk_*.json results/bench_no_async_chunk_*.json \ + --labels "async_chunk" "no_async_chunk" \ + --output results/qwen3_tts_benchmark.png + + # Single config: + python plot_results.py \ + --results results/bench_async_chunk_*.json \ + --labels "async_chunk" \ + --output results/qwen3_tts_benchmark.png +""" + +import argparse +import json +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np + + +def load_results(result_files: list[str]) -> list[list[dict]]: + """Load benchmark results from JSON files.""" + all_results = [] + for f in result_files: + with open(f) as fh: + data = json.load(fh) + all_results.append(data) + return all_results + + +def plot_comparison( + all_results: list[list[dict]], + labels: list[str], + output_path: str, + title_prefix: str = "Qwen3-TTS", +): + """Generate comparison bar charts.""" + n_configs = len(all_results) + + # Collect concurrency levels present in ALL configs (skip missing data) + all_concurrencies = [set(r["concurrency"] for r in results) for results in all_results] + concurrencies = sorted(set.union(*all_concurrencies)) + + # Build data arrays, using None for missing concurrency levels + ttfp_data = {label: [] for label in labels} + e2e_data = {label: [] for label in labels} + rtf_data = {label: [] for label in labels} + throughput_data = {label: [] for label in labels} + + for results, label in zip(all_results, labels): + conc_map = {r["concurrency"]: r for r in results} + for c in concurrencies: + r = conc_map.get(c) + ttfp_data[label].append(r["mean_ttfp_ms"] if r else None) + e2e_data[label].append(r["mean_e2e_ms"] if r else None) + 
rtf_data[label].append(r["mean_rtf"] if r else None) + throughput_data[label].append(r["audio_throughput"] if r else None) + + fig, axes = plt.subplots(2, 2, figsize=(14, 10)) + fig.suptitle(f"{title_prefix} Performance Benchmark", fontsize=16, fontweight="bold") + + x = np.arange(len(concurrencies)) + width = 0.35 if n_configs == 2 else 0.5 + if n_configs > 1: + offsets = np.linspace(-width / 2 * (n_configs - 1), width / 2 * (n_configs - 1), n_configs) + else: + offsets = [0] + + colors = ["#2196F3", "#FF5722", "#4CAF50", "#FFC107"] + + def plot_metric(ax, data_dict, ylabel, title, fmt=".1f"): + bars = [] + for i, (label, values) in enumerate(data_dict.items()): + # Replace None with 0 for plotting, but track which are missing + plot_values = [v if v is not None else 0 for v in values] + color = colors[i % len(colors)] + bar = ax.bar(x + offsets[i], plot_values, width, label=label, color=color, alpha=0.85) + bars.append(bar) + # Add value labels on bars (skip None/missing data) + max_val = max((v for v in values if v is not None), default=1) + for rect, val in zip(bar, values): + if val is not None and val > 0: + ax.text( + rect.get_x() + rect.get_width() / 2, + rect.get_height() + max_val * 0.02, + f"{val:{fmt}}", + ha="center", + va="bottom", + fontsize=9, + fontweight="bold", + ) + ax.set_xlabel("Concurrency", fontsize=12) + ax.set_ylabel(ylabel, fontsize=12) + ax.set_title(title, fontsize=13, fontweight="bold") + ax.set_xticks(x) + ax.set_xticklabels([str(c) for c in concurrencies]) + ax.legend(fontsize=10) + ax.grid(axis="y", alpha=0.3) + ax.set_axisbelow(True) + + plot_metric(axes[0, 0], ttfp_data, "TTFP (ms)", "Time to First Audio Packet (TTFP)") + plot_metric(axes[0, 1], e2e_data, "E2E Latency (ms)", "End-to-End Latency (E2E)") + plot_metric(axes[1, 0], rtf_data, "RTF", "Real-Time Factor (RTF)", fmt=".3f") + plot_metric(axes[1, 1], throughput_data, "Audio-sec / Wall-sec", "Audio Throughput", fmt=".2f") + + plt.tight_layout() + plt.savefig(output_path, 
dpi=150, bbox_inches="tight") + print(f"Plot saved to {output_path}") + plt.close() + + +def plot_single_summary(results: list[dict], label: str, output_path: str): + """Generate a single-config summary with percentile breakdown.""" + concurrencies = [r["concurrency"] for r in results] + + fig, axes = plt.subplots(1, 3, figsize=(16, 5)) + fig.suptitle(f"Qwen3-TTS Benchmark - {label}", fontsize=15, fontweight="bold") + + # TTFP breakdown + ax = axes[0] + means = [r["mean_ttfp_ms"] for r in results] + medians = [r["median_ttfp_ms"] for r in results] + p90s = [r["p90_ttfp_ms"] for r in results] + p99s = [r["p99_ttfp_ms"] for r in results] + x = np.arange(len(concurrencies)) + w = 0.2 + ax.bar(x - 1.5 * w, means, w, label="mean", color="#2196F3") + ax.bar(x - 0.5 * w, medians, w, label="median", color="#4CAF50") + ax.bar(x + 0.5 * w, p90s, w, label="p90", color="#FF9800") + ax.bar(x + 1.5 * w, p99s, w, label="p99", color="#F44336") + ax.set_xticks(x) + ax.set_xticklabels([str(c) for c in concurrencies]) + ax.set_xlabel("Concurrency") + ax.set_ylabel("TTFP (ms)") + ax.set_title("Time to First Audio Packet") + ax.legend(fontsize=9) + ax.grid(axis="y", alpha=0.3) + + # E2E breakdown + ax = axes[1] + means = [r["mean_e2e_ms"] for r in results] + medians = [r["median_e2e_ms"] for r in results] + p90s = [r["p90_e2e_ms"] for r in results] + p99s = [r["p99_e2e_ms"] for r in results] + ax.bar(x - 1.5 * w, means, w, label="mean", color="#2196F3") + ax.bar(x - 0.5 * w, medians, w, label="median", color="#4CAF50") + ax.bar(x + 0.5 * w, p90s, w, label="p90", color="#FF9800") + ax.bar(x + 1.5 * w, p99s, w, label="p99", color="#F44336") + ax.set_xticks(x) + ax.set_xticklabels([str(c) for c in concurrencies]) + ax.set_xlabel("Concurrency") + ax.set_ylabel("E2E Latency (ms)") + ax.set_title("End-to-End Latency") + ax.legend(fontsize=9) + ax.grid(axis="y", alpha=0.3) + + # RTF + ax = axes[2] + means = [r["mean_rtf"] for r in results] + medians = [r["median_rtf"] for r in results] + 
ax.bar(x - 0.15, means, 0.3, label="mean", color="#2196F3") + ax.bar(x + 0.15, medians, 0.3, label="median", color="#4CAF50") + ax.set_xticks(x) + ax.set_xticklabels([str(c) for c in concurrencies]) + ax.set_xlabel("Concurrency") + ax.set_ylabel("RTF") + ax.set_title("Real-Time Factor") + ax.legend(fontsize=9) + ax.grid(axis="y", alpha=0.3) + + plt.tight_layout() + plt.savefig(output_path, dpi=150, bbox_inches="tight") + print(f"Plot saved to {output_path}") + plt.close() + + +def print_comparison_table(all_results: list[list[dict]], labels: list[str]): + """Print a markdown-formatted comparison table.""" + concurrencies = sorted(set(r["concurrency"] for r in all_results[0])) + + print("\n## Benchmark Results\n") + header = "| Metric | Concurrency |" + sep = "| --- | --- |" + for label in labels: + header += f" {label} |" + sep += " --- |" + print(header) + print(sep) + + for metric, key, fmt in [ + ("TTFP (ms)", "mean_ttfp_ms", ".1f"), + ("E2E (ms)", "mean_e2e_ms", ".1f"), + ("RTF", "mean_rtf", ".3f"), + ("Throughput (audio-s/s)", "audio_throughput", ".2f"), + ]: + for c in concurrencies: + row = f"| {metric} | {c} |" + for results in all_results: + conc_map = {r["concurrency"]: r for r in results} + val = conc_map.get(c, {}).get(key, 0) + row += f" {val:{fmt}} |" + print(row) + + # Improvement calculation (only if 2 configs) + if len(all_results) == 2: + print(f"\n## Improvement ({labels[0]} vs {labels[1]})\n") + print("| Metric | Concurrency | Improvement |") + print("| --- | --- | --- |") + for metric, key in [("TTFP", "mean_ttfp_ms"), ("E2E", "mean_e2e_ms"), ("RTF", "mean_rtf")]: + for c in concurrencies: + m0 = {r["concurrency"]: r for r in all_results[0]} + m1 = {r["concurrency"]: r for r in all_results[1]} + v0 = m0.get(c, {}).get(key, 0) + v1 = m1.get(c, {}).get(key, 0) + if v1 > 0: + pct = (v1 - v0) / v1 * 100 + print(f"| {metric} | {c} | {pct:+.1f}% |") + + +def parse_args(): + parser = argparse.ArgumentParser(description="Plot Qwen3-TTS benchmark 
results") + parser.add_argument( + "--results", type=str, nargs="+", required=True, help="Path(s) to result JSON files (one per config)" + ) + parser.add_argument( + "--labels", type=str, nargs="+", required=True, help="Labels for each config (must match --results count)" + ) + parser.add_argument("--output", type=str, default="results/qwen3_tts_benchmark.png", help="Output image path") + parser.add_argument("--title", type=str, default="Qwen3-TTS", help="Title prefix for the plot") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + assert len(args.results) == len(args.labels), "--results and --labels must have the same count" + + all_results = load_results(args.results) + print_comparison_table(all_results, args.labels) + + Path(args.output).parent.mkdir(parents=True, exist_ok=True) + + if len(all_results) == 1: + plot_single_summary(all_results[0], args.labels[0], args.output) + else: + plot_comparison(all_results, args.labels, args.output, title_prefix=args.title) diff --git a/benchmarks/qwen3-tts/results/.gitignore b/benchmarks/qwen3-tts/results/.gitignore new file mode 100644 index 0000000000..5b6759ef71 --- /dev/null +++ b/benchmarks/qwen3-tts/results/.gitignore @@ -0,0 +1,3 @@ +# Benchmark results are machine-specific - do not commit +* +!.gitignore diff --git a/benchmarks/qwen3-tts/run_benchmark.sh b/benchmarks/qwen3-tts/run_benchmark.sh new file mode 100755 index 0000000000..ef85d64d6d --- /dev/null +++ b/benchmarks/qwen3-tts/run_benchmark.sh @@ -0,0 +1,272 @@ +#!/bin/bash +# Qwen3-TTS Benchmark Runner +# +# Compares vllm-omni streaming serving vs HuggingFace transformers offline inference. +# Produces JSON results and comparison plots. 
#
# Usage:
#   # Full comparison (vllm-omni + HF):
#   bash run_benchmark.sh
#
#   # Only vllm-omni async_chunk config:
#   bash run_benchmark.sh --async-only
#
#   # Only HuggingFace baseline:
#   bash run_benchmark.sh --hf-only
#
#   # vllm-omni only (skip HF):
#   bash run_benchmark.sh --skip-hf
#
#   # Custom settings:
#   GPU_DEVICE=1 NUM_PROMPTS=20 CONCURRENCY="1 4" bash run_benchmark.sh
#
#   # Use 1.7B model:
#   MODEL=Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice bash run_benchmark.sh --async-only
#
#   # Use batch_size=4 config:
#   STAGE_CONFIG=vllm_omni/configs/qwen3_tts_bs4.yaml bash run_benchmark.sh --async-only
#
# Environment variables:
#   GPU_DEVICE       - GPU index to use (default: 0)
#   NUM_PROMPTS      - Number of prompts per concurrency level (default: 50)
#   CONCURRENCY      - Space-separated concurrency levels (default: "1 4 10")
#   MODEL            - Model name (default: Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice)
#   PORT             - Server port (default: 8000)
#   GPU_MEM_TALKER   - gpu_memory_utilization for talker stage (default: 0.3)
#   GPU_MEM_CODE2WAV - gpu_memory_utilization for code2wav stage (default: 0.2)
#   NUM_WARMUPS      - Warmup requests before timing (default: 3)
#   STAGE_CONFIG     - Path to stage config YAML (default: configs/qwen3_tts_bs1.yaml)

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"

# Defaults
GPU_DEVICE="${GPU_DEVICE:-0}"
NUM_PROMPTS="${NUM_PROMPTS:-50}"
CONCURRENCY="${CONCURRENCY:-1 4 10}"
MODEL="${MODEL:-Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice}"
PORT="${PORT:-8000}"
GPU_MEM_TALKER="${GPU_MEM_TALKER:-0.3}"
GPU_MEM_CODE2WAV="${GPU_MEM_CODE2WAV:-0.2}"
NUM_WARMUPS="${NUM_WARMUPS:-3}"
STAGE_CONFIG="${STAGE_CONFIG:-vllm_omni/configs/qwen3_tts_bs1.yaml}"
RESULT_DIR="${SCRIPT_DIR}/results"
TIMESTAMP="$(date +%Y%m%d_%H%M%S)"

# Parse args
RUN_ASYNC=true
RUN_HF=true
for arg in "$@"; do
    case "$arg" in
        --async-only) RUN_HF=false ;;
        --hf-only) RUN_ASYNC=false ;;
        --skip-hf) RUN_HF=false ;;
    esac
done

mkdir -p "${RESULT_DIR}"

echo "============================================================"
echo " Qwen3-TTS Benchmark"
echo "============================================================"
echo " GPU:          ${GPU_DEVICE}"
echo " Model:        ${MODEL}"
echo " Prompts:      ${NUM_PROMPTS}"
echo " Concurrency:  ${CONCURRENCY}"
echo " Port:         ${PORT}"
echo " Stage config: ${STAGE_CONFIG}"
echo " Results:      ${RESULT_DIR}"
echo "============================================================"

# Prepare stage config with correct GPU device and memory settings.
# Prints the patched config path on stdout (captured by the caller).
prepare_config() {
    local config_template="$1"
    local config_name="$2"
    local output_path="${RESULT_DIR}/${config_name}_stage_config.yaml"

    # Use sed to patch GPU device and memory utilization
    sed \
        -e "s/devices: \"0\"/devices: \"${GPU_DEVICE}\"/g" \
        -e "s/gpu_memory_utilization: 0.3/gpu_memory_utilization: ${GPU_MEM_TALKER}/g" \
        -e "s/gpu_memory_utilization: 0.2/gpu_memory_utilization: ${GPU_MEM_CODE2WAV}/g" \
        "${config_template}" > "${output_path}"

    echo "${output_path}"
}

# Start server in the background and wait for it to answer /v1/models
start_server() {
    local stage_config="$1"
    local config_name="$2"
    local log_file="${RESULT_DIR}/server_${config_name}_${TIMESTAMP}.log"

    echo ""
    echo "Starting server with config: ${config_name}"
    echo "  Stage config: ${stage_config}"
    echo "  Log file: ${log_file}"

    VLLM_WORKER_MULTIPROC_METHOD=spawn \
    CUDA_VISIBLE_DEVICES="${GPU_DEVICE}" \
    python -m vllm_omni.entrypoints.cli.main serve "${MODEL}" \
        --omni \
        --host 127.0.0.1 \
        --port "${PORT}" \
        --stage-configs-path "${stage_config}" \
        --stage-init-timeout 120 \
        --trust-remote-code \
        --disable-log-stats \
        > "${log_file}" 2>&1 &

    SERVER_PID=$!
    echo "  Server PID: ${SERVER_PID}"

    # Wait for server to be ready
    echo "  Waiting for server to be ready..."
    local max_wait=300
    local waited=0
    while [ ${waited} -lt ${max_wait} ]; do
        if curl -sf "http://127.0.0.1:${PORT}/v1/models" > /dev/null 2>&1; then
            echo "  Server is ready! (waited ${waited}s)"
            return 0
        fi
        # Check if process is still alive
        if ! kill -0 ${SERVER_PID} 2>/dev/null; then
            echo "  ERROR: Server process died. Check log: ${log_file}"
            tail -20 "${log_file}"
            return 1
        fi
        sleep 2
        waited=$((waited + 2))
    done

    echo "  ERROR: Server did not start within ${max_wait}s. Check log: ${log_file}"
    kill ${SERVER_PID} 2>/dev/null || true
    return 1
}

# Stop the server (idempotent: no-op if already stopped)
stop_server() {
    if [ -n "${SERVER_PID:-}" ]; then
        echo "  Stopping server (PID: ${SERVER_PID})..."
        kill ${SERVER_PID} 2>/dev/null || true
        wait ${SERVER_PID} 2>/dev/null || true
        # Kill any remaining child processes on the port
        local pids
        pids=$(lsof -ti:${PORT} 2>/dev/null || true)
        if [ -n "${pids}" ]; then
            echo "  Cleaning up remaining processes on port ${PORT}..."
            echo "${pids}" | xargs kill -9 2>/dev/null || true
        fi
        echo "  Server stopped."
        SERVER_PID=""
    fi
}

# Cleanup on exit
trap 'stop_server' EXIT

# Run benchmark for a given config
run_bench() {
    local config_name="$1"
    local config_template="$2"

    echo ""
    echo "============================================================"
    echo " Benchmarking: ${config_name}"
    echo "============================================================"

    local stage_config
    stage_config=$(prepare_config "${config_template}" "${config_name}")

    start_server "${stage_config}" "${config_name}"

    cd "${PROJECT_ROOT}"
    # CONCURRENCY is intentionally unquoted: word splitting turns
    # e.g. "1 4 10" into three separate --max-concurrency values.
    python "${SCRIPT_DIR}/vllm_omni/bench_tts_serve.py" \
        --host 127.0.0.1 \
        --port "${PORT}" \
        --num-prompts "${NUM_PROMPTS}" \
        --max-concurrency ${CONCURRENCY} \
        --num-warmups "${NUM_WARMUPS}" \
        --config-name "${config_name}" \
        --result-dir "${RESULT_DIR}"

    stop_server

    # Allow GPU memory to settle
    sleep 5
}

# Run vllm-omni benchmark
if [ "${RUN_ASYNC}" = true ]; then
    run_bench "async_chunk" "${SCRIPT_DIR}/${STAGE_CONFIG}"
fi

# Run HuggingFace baseline benchmark
if [ "${RUN_HF}" = true ]; then
    echo ""
    echo "============================================================"
    echo " Benchmarking: HuggingFace transformers (offline)"
    echo "============================================================"

    cd "${PROJECT_ROOT}"
    python "${SCRIPT_DIR}/transformers/bench_tts_hf.py" \
        --model "${MODEL}" \
        --num-prompts "${NUM_PROMPTS}" \
        --num-warmups "${NUM_WARMUPS}" \
        --gpu-device "${GPU_DEVICE}" \
        --config-name "hf_transformers" \
        --result-dir "${RESULT_DIR}"

    # Allow GPU memory to settle
    sleep 5
fi

# Plot results
echo ""
echo "============================================================"
echo " Generating plots..."
+echo "============================================================" + +RESULT_FILES="" +LABELS="" + +if [ "${RUN_ASYNC}" = true ]; then + ASYNC_FILE=$(ls -t "${RESULT_DIR}"/bench_async_chunk_*.json 2>/dev/null | head -1) + if [ -n "${ASYNC_FILE}" ]; then + RESULT_FILES="${ASYNC_FILE}" + LABELS="async_chunk" + fi +fi + +if [ "${RUN_HF}" = true ]; then + HF_FILE=$(ls -t "${RESULT_DIR}"/bench_hf_transformers_*.json 2>/dev/null | head -1) + if [ -n "${HF_FILE}" ]; then + if [ -n "${RESULT_FILES}" ]; then + RESULT_FILES="${RESULT_FILES} ${HF_FILE}" + LABELS="${LABELS} hf_transformers" + else + RESULT_FILES="${HF_FILE}" + LABELS="hf_transformers" + fi + fi +fi + +if [ -n "${RESULT_FILES}" ]; then + python "${SCRIPT_DIR}/plot_results.py" \ + --results ${RESULT_FILES} \ + --labels ${LABELS} \ + --output "${RESULT_DIR}/qwen3_tts_benchmark_${TIMESTAMP}.png" +fi + +echo "" +echo "============================================================" +echo " Benchmark complete!" +echo " Results: ${RESULT_DIR}" +echo "============================================================" diff --git a/benchmarks/qwen3-tts/transformers/bench_tts_hf.py b/benchmarks/qwen3-tts/transformers/bench_tts_hf.py new file mode 100644 index 0000000000..63cdef6d58 --- /dev/null +++ b/benchmarks/qwen3-tts/transformers/bench_tts_hf.py @@ -0,0 +1,256 @@ +"""Benchmark Qwen3-TTS using HuggingFace transformers (qwen_tts library). + +Measures E2E latency, RTF, and audio duration for offline (non-serving) inference. +Results are saved in the same JSON format as bench_tts_serve.py for unified plotting. 
+ +Usage: + python bench_tts_hf.py \ + --model Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice \ + --num-prompts 50 \ + --num-warmups 3 \ + --gpu-device 0 \ + --result-dir results/ +""" + +import argparse +import json +import time +from dataclasses import asdict, dataclass, field +from datetime import datetime +from pathlib import Path + +import numpy as np +import soundfile as sf +import torch + +PROMPTS = [ + "Hello, welcome to the voice synthesis benchmark test.", + "She said she would be here by noon, but nobody showed up.", + "The quick brown fox jumps over the lazy dog near the riverbank.", + "I can't believe how beautiful the sunset looks from up here on the mountain.", + "Please remember to bring your identification documents to the appointment tomorrow morning.", + "Have you ever wondered what it would be like to travel through time and visit ancient civilizations?", + "The restaurant on the corner serves the best pasta I have ever tasted in my entire life.", + "After the meeting, we should discuss the quarterly results and plan for the next phase.", + "Learning a new language takes patience, practice, and a genuine curiosity about other cultures.", + "The train leaves at half past seven, so we need to arrive at the station before then.", + "Could you please turn down the music a little bit, I'm trying to concentrate on my work.", + "It was a dark and stormy night when the old lighthouse keeper heard a knock at the door.", +] + + +@dataclass +class BenchmarkResult: + config_name: str = "" + concurrency: int = 1 # always 1 for offline + num_prompts: int = 0 + completed: int = 0 + failed: int = 0 + duration_s: float = 0.0 + # TTFP stats - not applicable for HF offline, set to E2E for compatibility + mean_ttfp_ms: float = 0.0 + median_ttfp_ms: float = 0.0 + std_ttfp_ms: float = 0.0 + p90_ttfp_ms: float = 0.0 + p95_ttfp_ms: float = 0.0 + p99_ttfp_ms: float = 0.0 + # E2E stats (ms) + mean_e2e_ms: float = 0.0 + median_e2e_ms: float = 0.0 + std_e2e_ms: float = 0.0 + 
p90_e2e_ms: float = 0.0 + p95_e2e_ms: float = 0.0 + p99_e2e_ms: float = 0.0 + # RTF stats + mean_rtf: float = 0.0 + median_rtf: float = 0.0 + std_rtf: float = 0.0 + # Audio stats + mean_audio_duration_s: float = 0.0 + total_audio_duration_s: float = 0.0 + audio_throughput: float = 0.0 + request_throughput: float = 0.0 + # Per-request details + per_request: list = field(default_factory=list) + + +def run_benchmark(args): + from qwen_tts import Qwen3TTSModel + + device = f"cuda:{args.gpu_device}" + print(f"Loading model: {args.model} on {device}") + model = Qwen3TTSModel.from_pretrained( + args.model, + device_map=device, + dtype=torch.bfloat16, + ) + print("Model loaded.") + + # Build prompt list + prompts = [PROMPTS[i % len(PROMPTS)] for i in range(args.num_prompts)] + + # Warmup + if args.num_warmups > 0: + print(f"Warming up with {args.num_warmups} requests...") + for i in range(args.num_warmups): + p = PROMPTS[i % len(PROMPTS)] + wavs, sr = model.generate_custom_voice( + text=p, + language=args.language, + speaker=args.voice, + ) + # Sync GPU + torch.cuda.synchronize(device) + print("Warmup done.") + + # Benchmark + print(f"Running {args.num_prompts} requests sequentially...") + e2e_times = [] + rtfs = [] + audio_durations = [] + per_request = [] + failed = 0 + + audio_dir = None + if args.save_audio: + audio_dir = Path(args.result_dir) / "audio_hf" + audio_dir.mkdir(parents=True, exist_ok=True) + + total_start = time.perf_counter() + + for i, prompt in enumerate(prompts): + try: + torch.cuda.synchronize(device) + st = time.perf_counter() + + wavs, sr = model.generate_custom_voice( + text=prompt, + language=args.language, + speaker=args.voice, + ) + + torch.cuda.synchronize(device) + elapsed = time.perf_counter() - st + + # Compute audio duration + audio_samples = wavs[0] + if isinstance(audio_samples, torch.Tensor): + audio_samples = audio_samples.cpu().numpy() + audio_dur = len(audio_samples) / sr + + rtf = elapsed / audio_dur if audio_dur > 0 else 0.0 + + 
e2e_times.append(elapsed) + rtfs.append(rtf) + audio_durations.append(audio_dur) + per_request.append( + { + "e2e_ms": elapsed * 1000, + "ttfp_ms": elapsed * 1000, # no streaming, TTFP = E2E + "rtf": rtf, + "audio_duration_s": audio_dur, + "prompt": prompt, + } + ) + + if audio_dir: + sf.write(str(audio_dir / f"output_{i:04d}.wav"), audio_samples, sr) + + if (i + 1) % 10 == 0 or i == 0: + print( + f" [{i + 1}/{args.num_prompts}] e2e={elapsed * 1000:.0f}ms rtf={rtf:.3f} audio={audio_dur:.2f}s" + ) + + except Exception as e: + print(f" [{i + 1}/{args.num_prompts}] FAILED: {e}") + failed += 1 + + total_duration = time.perf_counter() - total_start + completed = len(e2e_times) + + # Compute stats + result = BenchmarkResult( + config_name=args.config_name, + concurrency=1, + num_prompts=args.num_prompts, + completed=completed, + failed=failed, + duration_s=total_duration, + ) + + if e2e_times: + e2e_ms = [t * 1000 for t in e2e_times] + + result.mean_e2e_ms = float(np.mean(e2e_ms)) + result.median_e2e_ms = float(np.median(e2e_ms)) + result.std_e2e_ms = float(np.std(e2e_ms)) + result.p90_e2e_ms = float(np.percentile(e2e_ms, 90)) + result.p95_e2e_ms = float(np.percentile(e2e_ms, 95)) + result.p99_e2e_ms = float(np.percentile(e2e_ms, 99)) + + # For HF offline, TTFP = E2E (no streaming) + result.mean_ttfp_ms = result.mean_e2e_ms + result.median_ttfp_ms = result.median_e2e_ms + result.std_ttfp_ms = result.std_e2e_ms + result.p90_ttfp_ms = result.p90_e2e_ms + result.p95_ttfp_ms = result.p95_e2e_ms + result.p99_ttfp_ms = result.p99_e2e_ms + + result.mean_rtf = float(np.mean(rtfs)) + result.median_rtf = float(np.median(rtfs)) + result.std_rtf = float(np.std(rtfs)) + + result.mean_audio_duration_s = float(np.mean(audio_durations)) + result.total_audio_duration_s = float(np.sum(audio_durations)) + result.audio_throughput = result.total_audio_duration_s / total_duration + result.request_throughput = completed / total_duration + result.per_request = per_request + + # Print summary + 
print(f"\n{'=' * 60}") + print(f" HuggingFace Offline Benchmark: {args.config_name}") + print(f" Completed: {completed} | Failed: {failed}") + print(f" Total duration: {total_duration:.2f}s") + print( + f" {'E2E (ms):':<25} mean={result.mean_e2e_ms:.1f} median={result.median_e2e_ms:.1f}" + f" p90={result.p90_e2e_ms:.1f} p99={result.p99_e2e_ms:.1f}" + ) + print(f" {'RTF:':<25} mean={result.mean_rtf:.3f} median={result.median_rtf:.3f}") + print(f" {'Audio throughput:':<25} {result.audio_throughput:.2f} audio-sec/wall-sec") + print(f" {'Request throughput:':<25} {result.request_throughput:.2f} req/s") + print(f"{'=' * 60}\n") + + # Save results (as a list with single concurrency=1 entry, matching serve format) + result_dir = Path(args.result_dir) + result_dir.mkdir(parents=True, exist_ok=True) + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + result_file = result_dir / f"bench_{args.config_name}_{timestamp}.json" + + with open(result_file, "w") as f: + json.dump([asdict(result)], f, indent=2) + print(f"Results saved to {result_file}") + + return result + + +def parse_args(): + parser = argparse.ArgumentParser(description="Qwen3-TTS HuggingFace Benchmark") + parser.add_argument( + "--model", type=str, default="Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice", help="HuggingFace model name or path" + ) + parser.add_argument("--num-prompts", type=int, default=50) + parser.add_argument("--num-warmups", type=int, default=3) + parser.add_argument("--gpu-device", type=int, default=0) + parser.add_argument("--voice", type=str, default="Vivian") + parser.add_argument("--language", type=str, default="English") + parser.add_argument( + "--config-name", type=str, default="hf_transformers", help="Label for this config (used in filenames)" + ) + parser.add_argument("--result-dir", type=str, default="results") + parser.add_argument("--save-audio", action="store_true", help="Save generated audio files") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + 
    run_benchmark(args)
diff --git a/benchmarks/qwen3-tts/vllm_omni/bench_tts_serve.py b/benchmarks/qwen3-tts/vllm_omni/bench_tts_serve.py
new file mode 100644
index 0000000000..d934969283
--- /dev/null
+++ b/benchmarks/qwen3-tts/vllm_omni/bench_tts_serve.py
@@ -0,0 +1,325 @@
"""Benchmark client for Qwen3-TTS via /v1/audio/speech endpoint.

Measures TTFP (Time-to-First-Packet), E2E latency, and RTF (Real-Time Factor)
across configurable concurrency levels. Saves results as JSON for plotting.

Usage:
    python bench_tts_serve.py \
        --host 127.0.0.1 --port 8000 \
        --num-prompts 50 \
        --max-concurrency 1 4 10 \
        --result-dir results/
"""

import argparse
import asyncio
import json
import time
from dataclasses import asdict, dataclass, field
from datetime import datetime
from pathlib import Path

import aiohttp
import numpy as np
from tqdm.asyncio import tqdm

# Fixed prompt set, cycled to reach --num-prompts requests.
# NOTE(review): kept in sync manually with the copy in bench_tts_hf.py.
PROMPTS = [
    "Hello, welcome to the voice synthesis benchmark test.",
    "She said she would be here by noon, but nobody showed up.",
    "The quick brown fox jumps over the lazy dog near the riverbank.",
    "I can't believe how beautiful the sunset looks from up here on the mountain.",
    "Please remember to bring your identification documents to the appointment tomorrow morning.",
    "Have you ever wondered what it would be like to travel through time and visit ancient civilizations?",
    "The restaurant on the corner serves the best pasta I have ever tasted in my entire life.",
    "After the meeting, we should discuss the quarterly results and plan for the next phase.",
    "Learning a new language takes patience, practice, and a genuine curiosity about other cultures.",
    "The train leaves at half past seven, so we need to arrive at the station before then.",
    "Could you please turn down the music a little bit, I'm trying to concentrate on my work.",
    "It was a dark and stormy night when the old lighthouse keeper heard a knock at the door.",
]


@dataclass
class RequestResult:
    # Per-request measurement record for one streaming TTS call.
    success: bool = False
    ttfp: float = 0.0  # Time to first audio packet (seconds)
    e2e: float = 0.0  # End-to-end latency (seconds)
    audio_bytes: int = 0  # Total audio bytes received
    audio_duration: float = 0.0  # Audio duration in seconds (estimated from PCM)
    rtf: float = 0.0  # Real-time factor = e2e / audio_duration
    prompt: str = ""
    error: str = ""


@dataclass
class BenchmarkResult:
    # Aggregated stats for one concurrency level; same field schema as the
    # BenchmarkResult in bench_tts_hf.py so plot_results.py reads both.
    config_name: str = ""
    concurrency: int = 0
    num_prompts: int = 0
    completed: int = 0
    failed: int = 0
    duration_s: float = 0.0
    # TTFP stats (ms)
    mean_ttfp_ms: float = 0.0
    median_ttfp_ms: float = 0.0
    std_ttfp_ms: float = 0.0
    p90_ttfp_ms: float = 0.0
    p95_ttfp_ms: float = 0.0
    p99_ttfp_ms: float = 0.0
    # E2E stats (ms)
    mean_e2e_ms: float = 0.0
    median_e2e_ms: float = 0.0
    std_e2e_ms: float = 0.0
    p90_e2e_ms: float = 0.0
    p95_e2e_ms: float = 0.0
    p99_e2e_ms: float = 0.0
    # RTF stats
    mean_rtf: float = 0.0
    median_rtf: float = 0.0
    std_rtf: float = 0.0
    # Audio stats
    mean_audio_duration_s: float = 0.0
    total_audio_duration_s: float = 0.0
    audio_throughput: float = 0.0  # audio_duration / wall_time
    request_throughput: float = 0.0  # requests / second
    # Per-request details
    per_request: list = field(default_factory=list)


def pcm_bytes_to_duration(num_bytes: int, sample_rate: int = 24000, sample_width: int = 2) -> float:
    """Convert raw PCM byte count to duration in seconds."""
    # Defaults assume 24 kHz, 16-bit, mono PCM — presumably the server's
    # "pcm" response format; verify against the server config if it changes.
    num_samples = num_bytes / sample_width
    return num_samples / sample_rate


async def send_tts_request(
    session: aiohttp.ClientSession,
    api_url: str,
    prompt: str,
    voice: str = "vivian",
    language: str = "English",
    pbar: tqdm | None = None,
) -> RequestResult:
    """Send a streaming TTS request and measure latency metrics."""
    payload = {
        "input": prompt,
        "voice": voice,
        "language": language,
        "stream": True,
        "response_format": "pcm",
    }

    result = RequestResult(prompt=prompt)
    # Clock starts before the POST is issued, so TTFP includes connection
    # and request overhead, not just server-side generation time.
    st = time.perf_counter()

    try:
        async with
session.post(api_url, json=payload) as response: + if response.status != 200: + result.error = f"HTTP {response.status}: {await response.text()}" + result.success = False + return result + + first_chunk = True + total_bytes = 0 + + async for chunk in response.content.iter_any(): + if first_chunk and len(chunk) > 0: + result.ttfp = time.perf_counter() - st + first_chunk = False + total_bytes += len(chunk) + + result.e2e = time.perf_counter() - st + result.audio_bytes = total_bytes + result.audio_duration = pcm_bytes_to_duration(total_bytes) + + if result.audio_duration > 0: + result.rtf = result.e2e / result.audio_duration + result.success = True + + except Exception as e: + result.error = str(e) + result.success = False + result.e2e = time.perf_counter() - st + + if pbar: + pbar.update(1) + return result + + +async def run_benchmark( + host: str, + port: int, + num_prompts: int, + max_concurrency: int, + num_warmups: int = 3, + voice: str = "vivian", + language: str = "English", +) -> BenchmarkResult: + """Run benchmark at a given concurrency level.""" + api_url = f"http://{host}:{port}/v1/audio/speech" + + connector = aiohttp.TCPConnector( + limit=max_concurrency, + limit_per_host=max_concurrency, + keepalive_timeout=60, + ) + session = aiohttp.ClientSession( + connector=connector, + timeout=aiohttp.ClientTimeout(total=600), + ) + + # Warmup + if num_warmups > 0: + print(f" Warming up with {num_warmups} requests...") + warmup_tasks = [] + for i in range(num_warmups): + prompt = PROMPTS[i % len(PROMPTS)] + warmup_tasks.append(send_tts_request(session, api_url, prompt, voice, language)) + await asyncio.gather(*warmup_tasks) + print(" Warmup done.") + + # Build request list + request_prompts = [PROMPTS[i % len(PROMPTS)] for i in range(num_prompts)] + + # Run benchmark + print(f" Running {num_prompts} requests with concurrency={max_concurrency}...") + semaphore = asyncio.Semaphore(max_concurrency) + pbar = tqdm(total=num_prompts, desc=f" 
concurrency={max_concurrency}") + + async def limited_request(prompt): + async with semaphore: + return await send_tts_request(session, api_url, prompt, voice, language, pbar) + + start_time = time.perf_counter() + tasks = [asyncio.create_task(limited_request(p)) for p in request_prompts] + results: list[RequestResult] = await asyncio.gather(*tasks) + duration = time.perf_counter() - start_time + pbar.close() + + await session.close() + + # Compute stats + successful = [r for r in results if r.success] + failed = [r for r in results if not r.success] + + bench = BenchmarkResult( + concurrency=max_concurrency, + num_prompts=num_prompts, + completed=len(successful), + failed=len(failed), + duration_s=duration, + ) + + if successful: + ttfps = [r.ttfp * 1000 for r in successful] # convert to ms + e2es = [r.e2e * 1000 for r in successful] + rtfs = [r.rtf for r in successful] + audio_durs = [r.audio_duration for r in successful] + + bench.mean_ttfp_ms = float(np.mean(ttfps)) + bench.median_ttfp_ms = float(np.median(ttfps)) + bench.std_ttfp_ms = float(np.std(ttfps)) + bench.p90_ttfp_ms = float(np.percentile(ttfps, 90)) + bench.p95_ttfp_ms = float(np.percentile(ttfps, 95)) + bench.p99_ttfp_ms = float(np.percentile(ttfps, 99)) + + bench.mean_e2e_ms = float(np.mean(e2es)) + bench.median_e2e_ms = float(np.median(e2es)) + bench.std_e2e_ms = float(np.std(e2es)) + bench.p90_e2e_ms = float(np.percentile(e2es, 90)) + bench.p95_e2e_ms = float(np.percentile(e2es, 95)) + bench.p99_e2e_ms = float(np.percentile(e2es, 99)) + + bench.mean_rtf = float(np.mean(rtfs)) + bench.median_rtf = float(np.median(rtfs)) + bench.std_rtf = float(np.std(rtfs)) + + bench.mean_audio_duration_s = float(np.mean(audio_durs)) + bench.total_audio_duration_s = float(np.sum(audio_durs)) + bench.audio_throughput = bench.total_audio_duration_s / duration + bench.request_throughput = len(successful) / duration + + bench.per_request = [ + { + "ttfp_ms": r.ttfp * 1000, + "e2e_ms": r.e2e * 1000, + "rtf": r.rtf, + 
"audio_duration_s": r.audio_duration, + "prompt": r.prompt, + } + for r in successful + ] + + # Print summary + print(f"\n{'=' * 60}") + print(f" Concurrency: {max_concurrency} | Completed: {bench.completed} | Failed: {bench.failed}") + print(f" Duration: {duration:.2f}s | Throughput: {bench.request_throughput:.2f} req/s") + print( + f" {'TTFP (ms):':<25} mean={bench.mean_ttfp_ms:.1f} median={bench.median_ttfp_ms:.1f}" + f" p90={bench.p90_ttfp_ms:.1f} p99={bench.p99_ttfp_ms:.1f}" + ) + print( + f" {'E2E (ms):':<25} mean={bench.mean_e2e_ms:.1f} median={bench.median_e2e_ms:.1f}" + f" p90={bench.p90_e2e_ms:.1f} p99={bench.p99_e2e_ms:.1f}" + ) + print(f" {'RTF:':<25} mean={bench.mean_rtf:.3f} median={bench.median_rtf:.3f}") + print(f" {'Audio throughput:':<25} {bench.audio_throughput:.2f} audio-sec/wall-sec") + print(f"{'=' * 60}\n") + + if failed: + for r in failed[:3]: + print(f" [ERROR] {r.error[:200]}") + + return bench + + +async def main(args): + all_results = [] + + for concurrency in args.max_concurrency: + result = await run_benchmark( + host=args.host, + port=args.port, + num_prompts=args.num_prompts, + max_concurrency=concurrency, + num_warmups=args.num_warmups, + voice=args.voice, + language=args.language, + ) + result.config_name = args.config_name + all_results.append(asdict(result)) + + # Save results + result_dir = Path(args.result_dir) + result_dir.mkdir(parents=True, exist_ok=True) + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + result_file = result_dir / f"bench_{args.config_name}_{timestamp}.json" + + with open(result_file, "w") as f: + json.dump(all_results, f, indent=2) + print(f"Results saved to {result_file}") + + return all_results + + +def parse_args(): + parser = argparse.ArgumentParser(description="Qwen3-TTS Benchmark Client") + parser.add_argument("--host", type=str, default="127.0.0.1") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument("--num-prompts", type=int, default=50, help="Number of prompts per 
concurrency level") + parser.add_argument( # noqa: E501 + "--max-concurrency", type=int, nargs="+", default=[1, 4, 10], help="Concurrency levels to test" + ) + parser.add_argument("--num-warmups", type=int, default=3) + parser.add_argument("--voice", type=str, default="vivian") + parser.add_argument("--language", type=str, default="English") + parser.add_argument( + "--config-name", type=str, default="async_chunk", help="Label for this config (used in filenames)" + ) + parser.add_argument("--result-dir", type=str, default="results") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + asyncio.run(main(args)) diff --git a/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs1.yaml b/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs1.yaml new file mode 100644 index 0000000000..1597f8aa24 --- /dev/null +++ b/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs1.yaml @@ -0,0 +1,96 @@ +# Qwen3-TTS batch_size=1 config (streaming with async_chunk) +# 2-stage pipeline: Talker -> Code2Wav +async_chunk: true +stage_args: + - stage_id: 0 + stage_type: llm + runtime: + devices: "0" + max_batch_size: 1 + engine_args: + model_stage: qwen3_tts + model_arch: Qwen3TTSTalkerForConditionalGeneration + hf_overrides: + architectures: [Qwen3TTSTalkerForConditionalGeneration] + worker_type: ar + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + enforce_eager: false + trust_remote_code: true + async_scheduling: false + enable_prefix_caching: false + engine_output_type: latent + gpu_memory_utilization: 0.3 + distributed_executor_backend: "mp" + max_num_batched_tokens: 512 + max_model_len: 4096 + custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_tts.talker2code2wav_async_chunk + output_connectors: + to_stage_1: connector_of_shared_memory + default_sampling_params: + temperature: 0.9 + top_k: 50 + max_tokens: 4096 + seed: 42 + detokenize: false + repetition_penalty: 1.05 + stop_token_ids: [2150] + 
+ - stage_id: 1 + stage_type: llm + runtime: + devices: "0" + max_batch_size: 1 + engine_args: + model_stage: code2wav + model_arch: Qwen3TTSCode2Wav + hf_overrides: + architectures: [Qwen3TTSCode2Wav] + worker_type: generation + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + enforce_eager: true + trust_remote_code: true + async_scheduling: false + enable_prefix_caching: false + engine_output_type: audio + gpu_memory_utilization: 0.2 + distributed_executor_backend: "mp" + max_num_batched_tokens: 8192 + max_model_len: 32768 + engine_input_source: [0] + final_output: true + final_output_type: audio + input_connectors: + from_stage_0: connector_of_shared_memory + tts_args: + max_instructions_length: 500 + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 65536 + seed: 42 + detokenize: true + repetition_penalty: 1.0 + +runtime: + enabled: true + defaults: + window_size: -1 + max_inflight: 1 + + connectors: + connector_of_shared_memory: + name: SharedMemoryConnector + extra: + shm_threshold_bytes: 65536 + codec_streaming: true + connector_get_sleep_s: 0.01 + connector_get_max_wait_first_chunk: 3000 + connector_get_max_wait: 300 + codec_chunk_frames: 25 + codec_left_context_frames: 25 + + edges: + - from: 0 + to: 1 + window_size: -1 diff --git a/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs4.yaml b/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs4.yaml new file mode 100644 index 0000000000..18493f3aee --- /dev/null +++ b/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs4.yaml @@ -0,0 +1,97 @@ +# Qwen3-TTS batch_size=4 config (streaming with async_chunk) +# Enables concurrent request processing with max_inflight=4 +# 2-stage pipeline: Talker -> Code2Wav +async_chunk: true +stage_args: + - stage_id: 0 + stage_type: llm + runtime: + devices: "0" + max_batch_size: 4 + engine_args: + model_stage: qwen3_tts + model_arch: Qwen3TTSTalkerForConditionalGeneration + hf_overrides: + architectures: 
[Qwen3TTSTalkerForConditionalGeneration] + worker_type: ar + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + enforce_eager: false + trust_remote_code: true + async_scheduling: false + enable_prefix_caching: false + engine_output_type: latent + gpu_memory_utilization: 0.3 + distributed_executor_backend: "mp" + max_num_batched_tokens: 512 + max_model_len: 4096 + custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_tts.talker2code2wav_async_chunk + output_connectors: + to_stage_1: connector_of_shared_memory + default_sampling_params: + temperature: 0.9 + top_k: 50 + max_tokens: 4096 + seed: 42 + detokenize: false + repetition_penalty: 1.05 + stop_token_ids: [2150] + + - stage_id: 1 + stage_type: llm + runtime: + devices: "0" + max_batch_size: 4 + engine_args: + model_stage: code2wav + model_arch: Qwen3TTSCode2Wav + hf_overrides: + architectures: [Qwen3TTSCode2Wav] + worker_type: generation + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + enforce_eager: true + trust_remote_code: true + async_scheduling: false + enable_prefix_caching: false + engine_output_type: audio + gpu_memory_utilization: 0.2 + distributed_executor_backend: "mp" + max_num_batched_tokens: 8192 + max_model_len: 32768 + engine_input_source: [0] + final_output: true + final_output_type: audio + input_connectors: + from_stage_0: connector_of_shared_memory + tts_args: + max_instructions_length: 500 + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 65536 + seed: 42 + detokenize: true + repetition_penalty: 1.0 + +runtime: + enabled: true + defaults: + window_size: -1 + max_inflight: 4 + + connectors: + connector_of_shared_memory: + name: SharedMemoryConnector + extra: + shm_threshold_bytes: 65536 + codec_streaming: true + connector_get_sleep_s: 0.01 + connector_get_max_wait_first_chunk: 3000 + connector_get_max_wait: 300 + codec_chunk_frames: 25 + 
codec_left_context_frames: 25 + + edges: + - from: 0 + to: 1 + window_size: -1