diff --git a/benchmarks/qwen3-tts/README.md b/benchmarks/qwen3-tts/README.md new file mode 100644 index 0000000000..b038669e86 --- /dev/null +++ b/benchmarks/qwen3-tts/README.md @@ -0,0 +1,100 @@ +# Qwen3-TTS Benchmark + +Benchmarks for Qwen3-TTS text-to-speech models, comparing vLLM-Omni streaming serving against HuggingFace Transformers offline inference. + +## Prerequisites + +```bash +pip install matplotlib aiohttp soundfile numpy tqdm +pip install qwen_tts # for HF baseline +``` + +## Quick Start + +Run the full benchmark (vllm-omni + HF baseline) with a single command: + +```bash +cd benchmarks/qwen3-tts +bash run_benchmark.sh +``` + +Results (JSON + PNG plots) are saved to `results/`. + +### Common options + +```bash +# Only vllm-omni (skip HF baseline) +bash run_benchmark.sh --async-only + +# Only HF baseline +bash run_benchmark.sh --hf-only + +# Use a different model (e.g. 1.7B) +MODEL=Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice bash run_benchmark.sh --async-only + +# Use batch_size=4 config for higher throughput +STAGE_CONFIG=vllm_omni/configs/qwen3_tts_bs4.yaml bash run_benchmark.sh --async-only + +# Custom GPU, prompt count, concurrency levels +GPU_DEVICE=1 NUM_PROMPTS=20 CONCURRENCY="1 4" bash run_benchmark.sh +``` + +## Manual Steps + +### 1) Start the vLLM-Omni server + +```bash +CUDA_VISIBLE_DEVICES=0 python -m vllm_omni.entrypoints.cli.main serve \ + "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice" \ + --omni --host 127.0.0.1 --port 8000 \ + --stage-configs-path benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs1.yaml \ + --trust-remote-code +``` + +### 2) Run online serving benchmark + +```bash +python vllm_omni/bench_tts_serve.py \ + --port 8000 \ + --num-prompts 50 \ + --max-concurrency 1 4 10 \ + --config-name "async_chunk" \ + --result-dir results/ +``` + +### 3) Run HuggingFace baseline + +```bash +python transformers/bench_tts_hf.py \ + --model "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice" \ + --num-prompts 50 \ + --gpu-device 0 \ + --result-dir results/ +``` + 

### 4) Generate comparison plots

```bash
python plot_results.py \
    --results results/bench_async_chunk_*.json results/bench_hf_transformers_*.json \
    --labels "vllm-omni" "hf_transformers" \
    --output results/comparison.png
```

## Stage Configs

| Config | Batch Size | Description |
|--------|:----------:|-------------|
| `vllm_omni/configs/qwen3_tts_bs1.yaml` | 1 | Single-request processing (lower latency) |
| `vllm_omni/configs/qwen3_tts_bs4.yaml` | 4 | Concurrent request processing (higher throughput) |

Both configs use a 2-stage pipeline (Talker -> Code2Wav) with `async_chunk` streaming enabled. The `SharedMemoryConnector` streams codec frames (25-frame chunks with 25-frame context overlap) between stages.

The stage configs do not pin a model: the model is passed on the command line (the positional argument of `serve`, or the `MODEL` env var in `run_benchmark.sh`), so the same configs work for both the 0.6B and 1.7B model variants.

## Metrics

- **TTFP (Time to First Audio Packet)**: Time from request to first audio chunk (streaming latency)
- **E2E (End-to-End Latency)**: Total time from request to complete audio response
- **RTF (Real-Time Factor)**: E2E latency / audio duration. RTF < 1.0 means faster-than-real-time synthesis
- **Throughput**: Total audio seconds generated per wall-clock second
diff --git a/benchmarks/qwen3-tts/plot_results.py b/benchmarks/qwen3-tts/plot_results.py
new file mode 100644
index 0000000000..e750101e32
--- /dev/null
+++ b/benchmarks/qwen3-tts/plot_results.py
@@ -0,0 +1,254 @@
"""Plot Qwen3-TTS benchmark results.
+ +Generates comparison bar charts similar to the async_chunk design doc: +- TTFP (Time-to-First-Packet) across concurrency levels +- E2E latency across concurrency levels +- RTF (Real-Time Factor) across concurrency levels + +Usage: + # Compare two configs (async_chunk vs no_async_chunk): + python plot_results.py \ + --results results/bench_async_chunk_*.json results/bench_no_async_chunk_*.json \ + --labels "async_chunk" "no_async_chunk" \ + --output results/qwen3_tts_benchmark.png + + # Single config: + python plot_results.py \ + --results results/bench_async_chunk_*.json \ + --labels "async_chunk" \ + --output results/qwen3_tts_benchmark.png +""" + +import argparse +import json +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np + + +def load_results(result_files: list[str]) -> list[list[dict]]: + """Load benchmark results from JSON files.""" + all_results = [] + for f in result_files: + with open(f) as fh: + data = json.load(fh) + all_results.append(data) + return all_results + + +def plot_comparison( + all_results: list[list[dict]], + labels: list[str], + output_path: str, + title_prefix: str = "Qwen3-TTS", +): + """Generate comparison bar charts.""" + n_configs = len(all_results) + + # Collect concurrency levels present in ALL configs (skip missing data) + all_concurrencies = [set(r["concurrency"] for r in results) for results in all_results] + concurrencies = sorted(set.union(*all_concurrencies)) + + # Build data arrays, using None for missing concurrency levels + ttfp_data = {label: [] for label in labels} + e2e_data = {label: [] for label in labels} + rtf_data = {label: [] for label in labels} + throughput_data = {label: [] for label in labels} + + for results, label in zip(all_results, labels): + conc_map = {r["concurrency"]: r for r in results} + for c in concurrencies: + r = conc_map.get(c) + ttfp_data[label].append(r["mean_ttfp_ms"] if r else None) + e2e_data[label].append(r["mean_e2e_ms"] if r else None) + 
rtf_data[label].append(r["mean_rtf"] if r else None) + throughput_data[label].append(r["audio_throughput"] if r else None) + + fig, axes = plt.subplots(2, 2, figsize=(14, 10)) + fig.suptitle(f"{title_prefix} Performance Benchmark", fontsize=16, fontweight="bold") + + x = np.arange(len(concurrencies)) + width = 0.35 if n_configs == 2 else 0.5 + if n_configs > 1: + offsets = np.linspace(-width / 2 * (n_configs - 1), width / 2 * (n_configs - 1), n_configs) + else: + offsets = [0] + + colors = ["#2196F3", "#FF5722", "#4CAF50", "#FFC107"] + + def plot_metric(ax, data_dict, ylabel, title, fmt=".1f"): + bars = [] + for i, (label, values) in enumerate(data_dict.items()): + # Replace None with 0 for plotting, but track which are missing + plot_values = [v if v is not None else 0 for v in values] + color = colors[i % len(colors)] + bar = ax.bar(x + offsets[i], plot_values, width, label=label, color=color, alpha=0.85) + bars.append(bar) + # Add value labels on bars (skip None/missing data) + max_val = max((v for v in values if v is not None), default=1) + for rect, val in zip(bar, values): + if val is not None and val > 0: + ax.text( + rect.get_x() + rect.get_width() / 2, + rect.get_height() + max_val * 0.02, + f"{val:{fmt}}", + ha="center", + va="bottom", + fontsize=9, + fontweight="bold", + ) + ax.set_xlabel("Concurrency", fontsize=12) + ax.set_ylabel(ylabel, fontsize=12) + ax.set_title(title, fontsize=13, fontweight="bold") + ax.set_xticks(x) + ax.set_xticklabels([str(c) for c in concurrencies]) + ax.legend(fontsize=10) + ax.grid(axis="y", alpha=0.3) + ax.set_axisbelow(True) + + plot_metric(axes[0, 0], ttfp_data, "TTFP (ms)", "Time to First Audio Packet (TTFP)") + plot_metric(axes[0, 1], e2e_data, "E2E Latency (ms)", "End-to-End Latency (E2E)") + plot_metric(axes[1, 0], rtf_data, "RTF", "Real-Time Factor (RTF)", fmt=".3f") + plot_metric(axes[1, 1], throughput_data, "Audio-sec / Wall-sec", "Audio Throughput", fmt=".2f") + + plt.tight_layout() + plt.savefig(output_path, 
dpi=150, bbox_inches="tight") + print(f"Plot saved to {output_path}") + plt.close() + + +def plot_single_summary(results: list[dict], label: str, output_path: str): + """Generate a single-config summary with percentile breakdown.""" + concurrencies = [r["concurrency"] for r in results] + + fig, axes = plt.subplots(1, 3, figsize=(16, 5)) + fig.suptitle(f"Qwen3-TTS Benchmark - {label}", fontsize=15, fontweight="bold") + + # TTFP breakdown + ax = axes[0] + means = [r["mean_ttfp_ms"] for r in results] + medians = [r["median_ttfp_ms"] for r in results] + p90s = [r["p90_ttfp_ms"] for r in results] + p99s = [r["p99_ttfp_ms"] for r in results] + x = np.arange(len(concurrencies)) + w = 0.2 + ax.bar(x - 1.5 * w, means, w, label="mean", color="#2196F3") + ax.bar(x - 0.5 * w, medians, w, label="median", color="#4CAF50") + ax.bar(x + 0.5 * w, p90s, w, label="p90", color="#FF9800") + ax.bar(x + 1.5 * w, p99s, w, label="p99", color="#F44336") + ax.set_xticks(x) + ax.set_xticklabels([str(c) for c in concurrencies]) + ax.set_xlabel("Concurrency") + ax.set_ylabel("TTFP (ms)") + ax.set_title("Time to First Audio Packet") + ax.legend(fontsize=9) + ax.grid(axis="y", alpha=0.3) + + # E2E breakdown + ax = axes[1] + means = [r["mean_e2e_ms"] for r in results] + medians = [r["median_e2e_ms"] for r in results] + p90s = [r["p90_e2e_ms"] for r in results] + p99s = [r["p99_e2e_ms"] for r in results] + ax.bar(x - 1.5 * w, means, w, label="mean", color="#2196F3") + ax.bar(x - 0.5 * w, medians, w, label="median", color="#4CAF50") + ax.bar(x + 0.5 * w, p90s, w, label="p90", color="#FF9800") + ax.bar(x + 1.5 * w, p99s, w, label="p99", color="#F44336") + ax.set_xticks(x) + ax.set_xticklabels([str(c) for c in concurrencies]) + ax.set_xlabel("Concurrency") + ax.set_ylabel("E2E Latency (ms)") + ax.set_title("End-to-End Latency") + ax.legend(fontsize=9) + ax.grid(axis="y", alpha=0.3) + + # RTF + ax = axes[2] + means = [r["mean_rtf"] for r in results] + medians = [r["median_rtf"] for r in results] + 
ax.bar(x - 0.15, means, 0.3, label="mean", color="#2196F3") + ax.bar(x + 0.15, medians, 0.3, label="median", color="#4CAF50") + ax.set_xticks(x) + ax.set_xticklabels([str(c) for c in concurrencies]) + ax.set_xlabel("Concurrency") + ax.set_ylabel("RTF") + ax.set_title("Real-Time Factor") + ax.legend(fontsize=9) + ax.grid(axis="y", alpha=0.3) + + plt.tight_layout() + plt.savefig(output_path, dpi=150, bbox_inches="tight") + print(f"Plot saved to {output_path}") + plt.close() + + +def print_comparison_table(all_results: list[list[dict]], labels: list[str]): + """Print a markdown-formatted comparison table.""" + concurrencies = sorted(set(r["concurrency"] for r in all_results[0])) + + print("\n## Benchmark Results\n") + header = "| Metric | Concurrency |" + sep = "| --- | --- |" + for label in labels: + header += f" {label} |" + sep += " --- |" + print(header) + print(sep) + + for metric, key, fmt in [ + ("TTFP (ms)", "mean_ttfp_ms", ".1f"), + ("E2E (ms)", "mean_e2e_ms", ".1f"), + ("RTF", "mean_rtf", ".3f"), + ("Throughput (audio-s/s)", "audio_throughput", ".2f"), + ]: + for c in concurrencies: + row = f"| {metric} | {c} |" + for results in all_results: + conc_map = {r["concurrency"]: r for r in results} + val = conc_map.get(c, {}).get(key, 0) + row += f" {val:{fmt}} |" + print(row) + + # Improvement calculation (only if 2 configs) + if len(all_results) == 2: + print(f"\n## Improvement ({labels[0]} vs {labels[1]})\n") + print("| Metric | Concurrency | Improvement |") + print("| --- | --- | --- |") + for metric, key in [("TTFP", "mean_ttfp_ms"), ("E2E", "mean_e2e_ms"), ("RTF", "mean_rtf")]: + for c in concurrencies: + m0 = {r["concurrency"]: r for r in all_results[0]} + m1 = {r["concurrency"]: r for r in all_results[1]} + v0 = m0.get(c, {}).get(key, 0) + v1 = m1.get(c, {}).get(key, 0) + if v1 > 0: + pct = (v1 - v0) / v1 * 100 + print(f"| {metric} | {c} | {pct:+.1f}% |") + + +def parse_args(): + parser = argparse.ArgumentParser(description="Plot Qwen3-TTS benchmark 
results") + parser.add_argument( + "--results", type=str, nargs="+", required=True, help="Path(s) to result JSON files (one per config)" + ) + parser.add_argument( + "--labels", type=str, nargs="+", required=True, help="Labels for each config (must match --results count)" + ) + parser.add_argument("--output", type=str, default="results/qwen3_tts_benchmark.png", help="Output image path") + parser.add_argument("--title", type=str, default="Qwen3-TTS", help="Title prefix for the plot") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + assert len(args.results) == len(args.labels), "--results and --labels must have the same count" + + all_results = load_results(args.results) + print_comparison_table(all_results, args.labels) + + Path(args.output).parent.mkdir(parents=True, exist_ok=True) + + if len(all_results) == 1: + plot_single_summary(all_results[0], args.labels[0], args.output) + else: + plot_comparison(all_results, args.labels, args.output, title_prefix=args.title) diff --git a/benchmarks/qwen3-tts/results/.gitignore b/benchmarks/qwen3-tts/results/.gitignore new file mode 100644 index 0000000000..5b6759ef71 --- /dev/null +++ b/benchmarks/qwen3-tts/results/.gitignore @@ -0,0 +1,3 @@ +# Benchmark results are machine-specific - do not commit +* +!.gitignore diff --git a/benchmarks/qwen3-tts/run_benchmark.sh b/benchmarks/qwen3-tts/run_benchmark.sh new file mode 100755 index 0000000000..ef85d64d6d --- /dev/null +++ b/benchmarks/qwen3-tts/run_benchmark.sh @@ -0,0 +1,272 @@ +#!/bin/bash +# Qwen3-TTS Benchmark Runner +# +# Compares vllm-omni streaming serving vs HuggingFace transformers offline inference. +# Produces JSON results and comparison plots. 
#
# Usage:
#   # Full comparison (vllm-omni + HF):
#   bash run_benchmark.sh
#
#   # Only vllm-omni async_chunk config:
#   bash run_benchmark.sh --async-only
#
#   # Only HuggingFace baseline:
#   bash run_benchmark.sh --hf-only
#
#   # vllm-omni only (skip HF):
#   bash run_benchmark.sh --skip-hf
#
#   # Custom settings:
#   GPU_DEVICE=1 NUM_PROMPTS=20 CONCURRENCY="1 4" bash run_benchmark.sh
#
#   # Use 1.7B model:
#   MODEL=Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice bash run_benchmark.sh --async-only
#
#   # Use batch_size=4 config:
#   STAGE_CONFIG=vllm_omni/configs/qwen3_tts_bs4.yaml bash run_benchmark.sh --async-only
#
# Environment variables:
#   GPU_DEVICE       - GPU index to use (default: 0)
#   NUM_PROMPTS      - Number of prompts per concurrency level (default: 50)
#   CONCURRENCY      - Space-separated concurrency levels (default: "1 4 10")
#   MODEL            - Model name (default: Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice)
#   PORT             - Server port (default: 8000)
#   GPU_MEM_TALKER   - gpu_memory_utilization for talker stage (default: 0.3)
#   GPU_MEM_CODE2WAV - gpu_memory_utilization for code2wav stage (default: 0.2)
#   NUM_WARMUPS      - Warmup requests before timing (default: 3)
#   STAGE_CONFIG     - Path to stage config YAML (default: configs/qwen3_tts_bs1.yaml)

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"

# Defaults
GPU_DEVICE="${GPU_DEVICE:-0}"
NUM_PROMPTS="${NUM_PROMPTS:-50}"
CONCURRENCY="${CONCURRENCY:-1 4 10}"
MODEL="${MODEL:-Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice}"
PORT="${PORT:-8000}"
GPU_MEM_TALKER="${GPU_MEM_TALKER:-0.3}"
GPU_MEM_CODE2WAV="${GPU_MEM_CODE2WAV:-0.2}"
NUM_WARMUPS="${NUM_WARMUPS:-3}"
STAGE_CONFIG="${STAGE_CONFIG:-vllm_omni/configs/qwen3_tts_bs1.yaml}"
RESULT_DIR="${SCRIPT_DIR}/results"
TIMESTAMP="$(date +%Y%m%d_%H%M%S)"

# Parse args
RUN_ASYNC=true
RUN_HF=true
for arg in "$@"; do
    case "$arg" in
        --async-only) RUN_HF=false ;;
        --hf-only) RUN_ASYNC=false ;;
        --skip-hf) RUN_HF=false ;;
    esac
done

mkdir -p "${RESULT_DIR}"

echo "============================================================"
echo " Qwen3-TTS Benchmark"
echo "============================================================"
echo " GPU:          ${GPU_DEVICE}"
echo " Model:        ${MODEL}"
echo " Prompts:      ${NUM_PROMPTS}"
echo " Concurrency:  ${CONCURRENCY}"
echo " Port:         ${PORT}"
echo " Stage config: ${STAGE_CONFIG}"
echo " Results:      ${RESULT_DIR}"
echo "============================================================"

# Prepare stage config with correct GPU device and memory settings.
# Prints the patched config path on stdout (captured by the caller).
prepare_config() {
    local config_template="$1"
    local config_name="$2"
    local output_path="${RESULT_DIR}/${config_name}_stage_config.yaml"

    # Use sed to patch GPU device and memory utilization
    sed \
        -e "s/devices: \"0\"/devices: \"${GPU_DEVICE}\"/g" \
        -e "s/gpu_memory_utilization: 0.3/gpu_memory_utilization: ${GPU_MEM_TALKER}/g" \
        -e "s/gpu_memory_utilization: 0.2/gpu_memory_utilization: ${GPU_MEM_CODE2WAV}/g" \
        "${config_template}" > "${output_path}"

    echo "${output_path}"
}

# Start server in the background and wait for it to answer /v1/models
start_server() {
    local stage_config="$1"
    local config_name="$2"
    local log_file="${RESULT_DIR}/server_${config_name}_${TIMESTAMP}.log"

    echo ""
    echo "Starting server with config: ${config_name}"
    echo "  Stage config: ${stage_config}"
    echo "  Log file: ${log_file}"

    VLLM_WORKER_MULTIPROC_METHOD=spawn \
    CUDA_VISIBLE_DEVICES="${GPU_DEVICE}" \
    python -m vllm_omni.entrypoints.cli.main serve "${MODEL}" \
        --omni \
        --host 127.0.0.1 \
        --port "${PORT}" \
        --stage-configs-path "${stage_config}" \
        --stage-init-timeout 120 \
        --trust-remote-code \
        --disable-log-stats \
        > "${log_file}" 2>&1 &

    SERVER_PID=$!
    echo "  Server PID: ${SERVER_PID}"

    # Wait for server to be ready
    echo "  Waiting for server to be ready..."
    local max_wait=300
    local waited=0
    while [ ${waited} -lt ${max_wait} ]; do
        if curl -sf "http://127.0.0.1:${PORT}/v1/models" > /dev/null 2>&1; then
            echo "  Server is ready! (waited ${waited}s)"
            return 0
        fi
        # Check if process is still alive
        if ! kill -0 ${SERVER_PID} 2>/dev/null; then
            echo "  ERROR: Server process died. Check log: ${log_file}"
            tail -20 "${log_file}"
            return 1
        fi
        sleep 2
        waited=$((waited + 2))
    done

    echo "  ERROR: Server did not start within ${max_wait}s. Check log: ${log_file}"
    kill ${SERVER_PID} 2>/dev/null || true
    return 1
}

# Stop the server (idempotent: no-op if already stopped)
stop_server() {
    if [ -n "${SERVER_PID:-}" ]; then
        echo "  Stopping server (PID: ${SERVER_PID})..."
        kill ${SERVER_PID} 2>/dev/null || true
        wait ${SERVER_PID} 2>/dev/null || true
        # Kill any remaining child processes on the port
        local pids
        pids=$(lsof -ti:${PORT} 2>/dev/null || true)
        if [ -n "${pids}" ]; then
            echo "  Cleaning up remaining processes on port ${PORT}..."
            echo "${pids}" | xargs kill -9 2>/dev/null || true
        fi
        echo "  Server stopped."
        SERVER_PID=""
    fi
}

# Cleanup on exit
trap 'stop_server' EXIT

# Run benchmark for a given config
run_bench() {
    local config_name="$1"
    local config_template="$2"

    echo ""
    echo "============================================================"
    echo " Benchmarking: ${config_name}"
    echo "============================================================"

    local stage_config
    stage_config=$(prepare_config "${config_template}" "${config_name}")

    start_server "${stage_config}" "${config_name}"

    cd "${PROJECT_ROOT}"
    # CONCURRENCY is intentionally unquoted: word splitting turns
    # e.g. "1 4 10" into three separate --max-concurrency values.
    python "${SCRIPT_DIR}/vllm_omni/bench_tts_serve.py" \
        --host 127.0.0.1 \
        --port "${PORT}" \
        --num-prompts "${NUM_PROMPTS}" \
        --max-concurrency ${CONCURRENCY} \
        --num-warmups "${NUM_WARMUPS}" \
        --config-name "${config_name}" \
        --result-dir "${RESULT_DIR}"

    stop_server

    # Allow GPU memory to settle
    sleep 5
}

# Run vllm-omni benchmark
if [ "${RUN_ASYNC}" = true ]; then
    run_bench "async_chunk" "${SCRIPT_DIR}/${STAGE_CONFIG}"
fi

# Run HuggingFace baseline benchmark
if [ "${RUN_HF}" = true ]; then
    echo ""
    echo "============================================================"
    echo " Benchmarking: HuggingFace transformers (offline)"
    echo "============================================================"

    cd "${PROJECT_ROOT}"
    python "${SCRIPT_DIR}/transformers/bench_tts_hf.py" \
        --model "${MODEL}" \
        --num-prompts "${NUM_PROMPTS}" \
        --num-warmups "${NUM_WARMUPS}" \
        --gpu-device "${GPU_DEVICE}" \
        --config-name "hf_transformers" \
        --result-dir "${RESULT_DIR}"

    # Allow GPU memory to settle
    sleep 5
fi

# Plot results
echo ""
echo "============================================================"
echo " Generating plots..."
+echo "============================================================" + +RESULT_FILES="" +LABELS="" + +if [ "${RUN_ASYNC}" = true ]; then + ASYNC_FILE=$(ls -t "${RESULT_DIR}"/bench_async_chunk_*.json 2>/dev/null | head -1) + if [ -n "${ASYNC_FILE}" ]; then + RESULT_FILES="${ASYNC_FILE}" + LABELS="async_chunk" + fi +fi + +if [ "${RUN_HF}" = true ]; then + HF_FILE=$(ls -t "${RESULT_DIR}"/bench_hf_transformers_*.json 2>/dev/null | head -1) + if [ -n "${HF_FILE}" ]; then + if [ -n "${RESULT_FILES}" ]; then + RESULT_FILES="${RESULT_FILES} ${HF_FILE}" + LABELS="${LABELS} hf_transformers" + else + RESULT_FILES="${HF_FILE}" + LABELS="hf_transformers" + fi + fi +fi + +if [ -n "${RESULT_FILES}" ]; then + python "${SCRIPT_DIR}/plot_results.py" \ + --results ${RESULT_FILES} \ + --labels ${LABELS} \ + --output "${RESULT_DIR}/qwen3_tts_benchmark_${TIMESTAMP}.png" +fi + +echo "" +echo "============================================================" +echo " Benchmark complete!" +echo " Results: ${RESULT_DIR}" +echo "============================================================" diff --git a/benchmarks/qwen3-tts/transformers/bench_tts_hf.py b/benchmarks/qwen3-tts/transformers/bench_tts_hf.py new file mode 100644 index 0000000000..63cdef6d58 --- /dev/null +++ b/benchmarks/qwen3-tts/transformers/bench_tts_hf.py @@ -0,0 +1,256 @@ +"""Benchmark Qwen3-TTS using HuggingFace transformers (qwen_tts library). + +Measures E2E latency, RTF, and audio duration for offline (non-serving) inference. +Results are saved in the same JSON format as bench_tts_serve.py for unified plotting. 
+ +Usage: + python bench_tts_hf.py \ + --model Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice \ + --num-prompts 50 \ + --num-warmups 3 \ + --gpu-device 0 \ + --result-dir results/ +""" + +import argparse +import json +import time +from dataclasses import asdict, dataclass, field +from datetime import datetime +from pathlib import Path + +import numpy as np +import soundfile as sf +import torch + +PROMPTS = [ + "Hello, welcome to the voice synthesis benchmark test.", + "She said she would be here by noon, but nobody showed up.", + "The quick brown fox jumps over the lazy dog near the riverbank.", + "I can't believe how beautiful the sunset looks from up here on the mountain.", + "Please remember to bring your identification documents to the appointment tomorrow morning.", + "Have you ever wondered what it would be like to travel through time and visit ancient civilizations?", + "The restaurant on the corner serves the best pasta I have ever tasted in my entire life.", + "After the meeting, we should discuss the quarterly results and plan for the next phase.", + "Learning a new language takes patience, practice, and a genuine curiosity about other cultures.", + "The train leaves at half past seven, so we need to arrive at the station before then.", + "Could you please turn down the music a little bit, I'm trying to concentrate on my work.", + "It was a dark and stormy night when the old lighthouse keeper heard a knock at the door.", +] + + +@dataclass +class BenchmarkResult: + config_name: str = "" + concurrency: int = 1 # always 1 for offline + num_prompts: int = 0 + completed: int = 0 + failed: int = 0 + duration_s: float = 0.0 + # TTFP stats - not applicable for HF offline, set to E2E for compatibility + mean_ttfp_ms: float = 0.0 + median_ttfp_ms: float = 0.0 + std_ttfp_ms: float = 0.0 + p90_ttfp_ms: float = 0.0 + p95_ttfp_ms: float = 0.0 + p99_ttfp_ms: float = 0.0 + # E2E stats (ms) + mean_e2e_ms: float = 0.0 + median_e2e_ms: float = 0.0 + std_e2e_ms: float = 0.0 + 
p90_e2e_ms: float = 0.0 + p95_e2e_ms: float = 0.0 + p99_e2e_ms: float = 0.0 + # RTF stats + mean_rtf: float = 0.0 + median_rtf: float = 0.0 + std_rtf: float = 0.0 + # Audio stats + mean_audio_duration_s: float = 0.0 + total_audio_duration_s: float = 0.0 + audio_throughput: float = 0.0 + request_throughput: float = 0.0 + # Per-request details + per_request: list = field(default_factory=list) + + +def run_benchmark(args): + from qwen_tts import Qwen3TTSModel + + device = f"cuda:{args.gpu_device}" + print(f"Loading model: {args.model} on {device}") + model = Qwen3TTSModel.from_pretrained( + args.model, + device_map=device, + dtype=torch.bfloat16, + ) + print("Model loaded.") + + # Build prompt list + prompts = [PROMPTS[i % len(PROMPTS)] for i in range(args.num_prompts)] + + # Warmup + if args.num_warmups > 0: + print(f"Warming up with {args.num_warmups} requests...") + for i in range(args.num_warmups): + p = PROMPTS[i % len(PROMPTS)] + wavs, sr = model.generate_custom_voice( + text=p, + language=args.language, + speaker=args.voice, + ) + # Sync GPU + torch.cuda.synchronize(device) + print("Warmup done.") + + # Benchmark + print(f"Running {args.num_prompts} requests sequentially...") + e2e_times = [] + rtfs = [] + audio_durations = [] + per_request = [] + failed = 0 + + audio_dir = None + if args.save_audio: + audio_dir = Path(args.result_dir) / "audio_hf" + audio_dir.mkdir(parents=True, exist_ok=True) + + total_start = time.perf_counter() + + for i, prompt in enumerate(prompts): + try: + torch.cuda.synchronize(device) + st = time.perf_counter() + + wavs, sr = model.generate_custom_voice( + text=prompt, + language=args.language, + speaker=args.voice, + ) + + torch.cuda.synchronize(device) + elapsed = time.perf_counter() - st + + # Compute audio duration + audio_samples = wavs[0] + if isinstance(audio_samples, torch.Tensor): + audio_samples = audio_samples.cpu().numpy() + audio_dur = len(audio_samples) / sr + + rtf = elapsed / audio_dur if audio_dur > 0 else 0.0 + + 
e2e_times.append(elapsed) + rtfs.append(rtf) + audio_durations.append(audio_dur) + per_request.append( + { + "e2e_ms": elapsed * 1000, + "ttfp_ms": elapsed * 1000, # no streaming, TTFP = E2E + "rtf": rtf, + "audio_duration_s": audio_dur, + "prompt": prompt, + } + ) + + if audio_dir: + sf.write(str(audio_dir / f"output_{i:04d}.wav"), audio_samples, sr) + + if (i + 1) % 10 == 0 or i == 0: + print( + f" [{i + 1}/{args.num_prompts}] e2e={elapsed * 1000:.0f}ms rtf={rtf:.3f} audio={audio_dur:.2f}s" + ) + + except Exception as e: + print(f" [{i + 1}/{args.num_prompts}] FAILED: {e}") + failed += 1 + + total_duration = time.perf_counter() - total_start + completed = len(e2e_times) + + # Compute stats + result = BenchmarkResult( + config_name=args.config_name, + concurrency=1, + num_prompts=args.num_prompts, + completed=completed, + failed=failed, + duration_s=total_duration, + ) + + if e2e_times: + e2e_ms = [t * 1000 for t in e2e_times] + + result.mean_e2e_ms = float(np.mean(e2e_ms)) + result.median_e2e_ms = float(np.median(e2e_ms)) + result.std_e2e_ms = float(np.std(e2e_ms)) + result.p90_e2e_ms = float(np.percentile(e2e_ms, 90)) + result.p95_e2e_ms = float(np.percentile(e2e_ms, 95)) + result.p99_e2e_ms = float(np.percentile(e2e_ms, 99)) + + # For HF offline, TTFP = E2E (no streaming) + result.mean_ttfp_ms = result.mean_e2e_ms + result.median_ttfp_ms = result.median_e2e_ms + result.std_ttfp_ms = result.std_e2e_ms + result.p90_ttfp_ms = result.p90_e2e_ms + result.p95_ttfp_ms = result.p95_e2e_ms + result.p99_ttfp_ms = result.p99_e2e_ms + + result.mean_rtf = float(np.mean(rtfs)) + result.median_rtf = float(np.median(rtfs)) + result.std_rtf = float(np.std(rtfs)) + + result.mean_audio_duration_s = float(np.mean(audio_durations)) + result.total_audio_duration_s = float(np.sum(audio_durations)) + result.audio_throughput = result.total_audio_duration_s / total_duration + result.request_throughput = completed / total_duration + result.per_request = per_request + + # Print summary + 
print(f"\n{'=' * 60}") + print(f" HuggingFace Offline Benchmark: {args.config_name}") + print(f" Completed: {completed} | Failed: {failed}") + print(f" Total duration: {total_duration:.2f}s") + print( + f" {'E2E (ms):':<25} mean={result.mean_e2e_ms:.1f} median={result.median_e2e_ms:.1f}" + f" p90={result.p90_e2e_ms:.1f} p99={result.p99_e2e_ms:.1f}" + ) + print(f" {'RTF:':<25} mean={result.mean_rtf:.3f} median={result.median_rtf:.3f}") + print(f" {'Audio throughput:':<25} {result.audio_throughput:.2f} audio-sec/wall-sec") + print(f" {'Request throughput:':<25} {result.request_throughput:.2f} req/s") + print(f"{'=' * 60}\n") + + # Save results (as a list with single concurrency=1 entry, matching serve format) + result_dir = Path(args.result_dir) + result_dir.mkdir(parents=True, exist_ok=True) + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + result_file = result_dir / f"bench_{args.config_name}_{timestamp}.json" + + with open(result_file, "w") as f: + json.dump([asdict(result)], f, indent=2) + print(f"Results saved to {result_file}") + + return result + + +def parse_args(): + parser = argparse.ArgumentParser(description="Qwen3-TTS HuggingFace Benchmark") + parser.add_argument( + "--model", type=str, default="Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice", help="HuggingFace model name or path" + ) + parser.add_argument("--num-prompts", type=int, default=50) + parser.add_argument("--num-warmups", type=int, default=3) + parser.add_argument("--gpu-device", type=int, default=0) + parser.add_argument("--voice", type=str, default="Vivian") + parser.add_argument("--language", type=str, default="English") + parser.add_argument( + "--config-name", type=str, default="hf_transformers", help="Label for this config (used in filenames)" + ) + parser.add_argument("--result-dir", type=str, default="results") + parser.add_argument("--save-audio", action="store_true", help="Save generated audio files") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + 
    run_benchmark(args)
diff --git a/benchmarks/qwen3-tts/vllm_omni/bench_tts_serve.py b/benchmarks/qwen3-tts/vllm_omni/bench_tts_serve.py
new file mode 100644
index 0000000000..d934969283
--- /dev/null
+++ b/benchmarks/qwen3-tts/vllm_omni/bench_tts_serve.py
@@ -0,0 +1,325 @@
"""Benchmark client for Qwen3-TTS via /v1/audio/speech endpoint.

Measures TTFP (Time-to-First-Packet), E2E latency, and RTF (Real-Time Factor)
across configurable concurrency levels. Saves results as JSON for plotting.

Usage:
    python bench_tts_serve.py \
        --host 127.0.0.1 --port 8000 \
        --num-prompts 50 \
        --max-concurrency 1 4 10 \
        --result-dir results/
"""

import argparse
import asyncio
import json
import time
from dataclasses import asdict, dataclass, field
from datetime import datetime
from pathlib import Path

import aiohttp
import numpy as np
from tqdm.asyncio import tqdm

# Fixed prompt set, cycled to reach --num-prompts requests.
# NOTE(review): kept in sync manually with the copy in bench_tts_hf.py.
PROMPTS = [
    "Hello, welcome to the voice synthesis benchmark test.",
    "She said she would be here by noon, but nobody showed up.",
    "The quick brown fox jumps over the lazy dog near the riverbank.",
    "I can't believe how beautiful the sunset looks from up here on the mountain.",
    "Please remember to bring your identification documents to the appointment tomorrow morning.",
    "Have you ever wondered what it would be like to travel through time and visit ancient civilizations?",
    "The restaurant on the corner serves the best pasta I have ever tasted in my entire life.",
    "After the meeting, we should discuss the quarterly results and plan for the next phase.",
    "Learning a new language takes patience, practice, and a genuine curiosity about other cultures.",
    "The train leaves at half past seven, so we need to arrive at the station before then.",
    "Could you please turn down the music a little bit, I'm trying to concentrate on my work.",
    "It was a dark and stormy night when the old lighthouse keeper heard a knock at the door.",
]


@dataclass
class RequestResult:
    # Per-request measurement record for one streaming TTS call.
    success: bool = False
    ttfp: float = 0.0  # Time to first audio packet (seconds)
    e2e: float = 0.0  # End-to-end latency (seconds)
    audio_bytes: int = 0  # Total audio bytes received
    audio_duration: float = 0.0  # Audio duration in seconds (estimated from PCM)
    rtf: float = 0.0  # Real-time factor = e2e / audio_duration
    prompt: str = ""
    error: str = ""


@dataclass
class BenchmarkResult:
    # Aggregated stats for one concurrency level; same field schema as the
    # BenchmarkResult in bench_tts_hf.py so plot_results.py reads both.
    config_name: str = ""
    concurrency: int = 0
    num_prompts: int = 0
    completed: int = 0
    failed: int = 0
    duration_s: float = 0.0
    # TTFP stats (ms)
    mean_ttfp_ms: float = 0.0
    median_ttfp_ms: float = 0.0
    std_ttfp_ms: float = 0.0
    p90_ttfp_ms: float = 0.0
    p95_ttfp_ms: float = 0.0
    p99_ttfp_ms: float = 0.0
    # E2E stats (ms)
    mean_e2e_ms: float = 0.0
    median_e2e_ms: float = 0.0
    std_e2e_ms: float = 0.0
    p90_e2e_ms: float = 0.0
    p95_e2e_ms: float = 0.0
    p99_e2e_ms: float = 0.0
    # RTF stats
    mean_rtf: float = 0.0
    median_rtf: float = 0.0
    std_rtf: float = 0.0
    # Audio stats
    mean_audio_duration_s: float = 0.0
    total_audio_duration_s: float = 0.0
    audio_throughput: float = 0.0  # audio_duration / wall_time
    request_throughput: float = 0.0  # requests / second
    # Per-request details
    per_request: list = field(default_factory=list)


def pcm_bytes_to_duration(num_bytes: int, sample_rate: int = 24000, sample_width: int = 2) -> float:
    """Convert raw PCM byte count to duration in seconds."""
    # Defaults assume 24 kHz, 16-bit, mono PCM — presumably the server's
    # "pcm" response format; verify against the server config if it changes.
    num_samples = num_bytes / sample_width
    return num_samples / sample_rate


async def send_tts_request(
    session: aiohttp.ClientSession,
    api_url: str,
    prompt: str,
    voice: str = "vivian",
    language: str = "English",
    pbar: tqdm | None = None,
) -> RequestResult:
    """Send a streaming TTS request and measure latency metrics."""
    payload = {
        "input": prompt,
        "voice": voice,
        "language": language,
        "stream": True,
        "response_format": "pcm",
    }

    result = RequestResult(prompt=prompt)
    # Clock starts before the POST is issued, so TTFP includes connection
    # and request overhead, not just server-side generation time.
    st = time.perf_counter()

    try:
        async with
session.post(api_url, json=payload) as response: + if response.status != 200: + result.error = f"HTTP {response.status}: {await response.text()}" + result.success = False + return result + + first_chunk = True + total_bytes = 0 + + async for chunk in response.content.iter_any(): + if first_chunk and len(chunk) > 0: + result.ttfp = time.perf_counter() - st + first_chunk = False + total_bytes += len(chunk) + + result.e2e = time.perf_counter() - st + result.audio_bytes = total_bytes + result.audio_duration = pcm_bytes_to_duration(total_bytes) + + if result.audio_duration > 0: + result.rtf = result.e2e / result.audio_duration + result.success = True + + except Exception as e: + result.error = str(e) + result.success = False + result.e2e = time.perf_counter() - st + + if pbar: + pbar.update(1) + return result + + +async def run_benchmark( + host: str, + port: int, + num_prompts: int, + max_concurrency: int, + num_warmups: int = 3, + voice: str = "vivian", + language: str = "English", +) -> BenchmarkResult: + """Run benchmark at a given concurrency level.""" + api_url = f"http://{host}:{port}/v1/audio/speech" + + connector = aiohttp.TCPConnector( + limit=max_concurrency, + limit_per_host=max_concurrency, + keepalive_timeout=60, + ) + session = aiohttp.ClientSession( + connector=connector, + timeout=aiohttp.ClientTimeout(total=600), + ) + + # Warmup + if num_warmups > 0: + print(f" Warming up with {num_warmups} requests...") + warmup_tasks = [] + for i in range(num_warmups): + prompt = PROMPTS[i % len(PROMPTS)] + warmup_tasks.append(send_tts_request(session, api_url, prompt, voice, language)) + await asyncio.gather(*warmup_tasks) + print(" Warmup done.") + + # Build request list + request_prompts = [PROMPTS[i % len(PROMPTS)] for i in range(num_prompts)] + + # Run benchmark + print(f" Running {num_prompts} requests with concurrency={max_concurrency}...") + semaphore = asyncio.Semaphore(max_concurrency) + pbar = tqdm(total=num_prompts, desc=f" 
concurrency={max_concurrency}") + + async def limited_request(prompt): + async with semaphore: + return await send_tts_request(session, api_url, prompt, voice, language, pbar) + + start_time = time.perf_counter() + tasks = [asyncio.create_task(limited_request(p)) for p in request_prompts] + results: list[RequestResult] = await asyncio.gather(*tasks) + duration = time.perf_counter() - start_time + pbar.close() + + await session.close() + + # Compute stats + successful = [r for r in results if r.success] + failed = [r for r in results if not r.success] + + bench = BenchmarkResult( + concurrency=max_concurrency, + num_prompts=num_prompts, + completed=len(successful), + failed=len(failed), + duration_s=duration, + ) + + if successful: + ttfps = [r.ttfp * 1000 for r in successful] # convert to ms + e2es = [r.e2e * 1000 for r in successful] + rtfs = [r.rtf for r in successful] + audio_durs = [r.audio_duration for r in successful] + + bench.mean_ttfp_ms = float(np.mean(ttfps)) + bench.median_ttfp_ms = float(np.median(ttfps)) + bench.std_ttfp_ms = float(np.std(ttfps)) + bench.p90_ttfp_ms = float(np.percentile(ttfps, 90)) + bench.p95_ttfp_ms = float(np.percentile(ttfps, 95)) + bench.p99_ttfp_ms = float(np.percentile(ttfps, 99)) + + bench.mean_e2e_ms = float(np.mean(e2es)) + bench.median_e2e_ms = float(np.median(e2es)) + bench.std_e2e_ms = float(np.std(e2es)) + bench.p90_e2e_ms = float(np.percentile(e2es, 90)) + bench.p95_e2e_ms = float(np.percentile(e2es, 95)) + bench.p99_e2e_ms = float(np.percentile(e2es, 99)) + + bench.mean_rtf = float(np.mean(rtfs)) + bench.median_rtf = float(np.median(rtfs)) + bench.std_rtf = float(np.std(rtfs)) + + bench.mean_audio_duration_s = float(np.mean(audio_durs)) + bench.total_audio_duration_s = float(np.sum(audio_durs)) + bench.audio_throughput = bench.total_audio_duration_s / duration + bench.request_throughput = len(successful) / duration + + bench.per_request = [ + { + "ttfp_ms": r.ttfp * 1000, + "e2e_ms": r.e2e * 1000, + "rtf": r.rtf, + 
"audio_duration_s": r.audio_duration, + "prompt": r.prompt, + } + for r in successful + ] + + # Print summary + print(f"\n{'=' * 60}") + print(f" Concurrency: {max_concurrency} | Completed: {bench.completed} | Failed: {bench.failed}") + print(f" Duration: {duration:.2f}s | Throughput: {bench.request_throughput:.2f} req/s") + print( + f" {'TTFP (ms):':<25} mean={bench.mean_ttfp_ms:.1f} median={bench.median_ttfp_ms:.1f}" + f" p90={bench.p90_ttfp_ms:.1f} p99={bench.p99_ttfp_ms:.1f}" + ) + print( + f" {'E2E (ms):':<25} mean={bench.mean_e2e_ms:.1f} median={bench.median_e2e_ms:.1f}" + f" p90={bench.p90_e2e_ms:.1f} p99={bench.p99_e2e_ms:.1f}" + ) + print(f" {'RTF:':<25} mean={bench.mean_rtf:.3f} median={bench.median_rtf:.3f}") + print(f" {'Audio throughput:':<25} {bench.audio_throughput:.2f} audio-sec/wall-sec") + print(f"{'=' * 60}\n") + + if failed: + for r in failed[:3]: + print(f" [ERROR] {r.error[:200]}") + + return bench + + +async def main(args): + all_results = [] + + for concurrency in args.max_concurrency: + result = await run_benchmark( + host=args.host, + port=args.port, + num_prompts=args.num_prompts, + max_concurrency=concurrency, + num_warmups=args.num_warmups, + voice=args.voice, + language=args.language, + ) + result.config_name = args.config_name + all_results.append(asdict(result)) + + # Save results + result_dir = Path(args.result_dir) + result_dir.mkdir(parents=True, exist_ok=True) + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + result_file = result_dir / f"bench_{args.config_name}_{timestamp}.json" + + with open(result_file, "w") as f: + json.dump(all_results, f, indent=2) + print(f"Results saved to {result_file}") + + return all_results + + +def parse_args(): + parser = argparse.ArgumentParser(description="Qwen3-TTS Benchmark Client") + parser.add_argument("--host", type=str, default="127.0.0.1") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument("--num-prompts", type=int, default=50, help="Number of prompts per 
concurrency level") + parser.add_argument( # noqa: E501 + "--max-concurrency", type=int, nargs="+", default=[1, 4, 10], help="Concurrency levels to test" + ) + parser.add_argument("--num-warmups", type=int, default=3) + parser.add_argument("--voice", type=str, default="vivian") + parser.add_argument("--language", type=str, default="English") + parser.add_argument( + "--config-name", type=str, default="async_chunk", help="Label for this config (used in filenames)" + ) + parser.add_argument("--result-dir", type=str, default="results") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + asyncio.run(main(args)) diff --git a/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs1.yaml b/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs1.yaml new file mode 100644 index 0000000000..1597f8aa24 --- /dev/null +++ b/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs1.yaml @@ -0,0 +1,96 @@ +# Qwen3-TTS batch_size=1 config (streaming with async_chunk) +# 2-stage pipeline: Talker -> Code2Wav +async_chunk: true +stage_args: + - stage_id: 0 + stage_type: llm + runtime: + devices: "0" + max_batch_size: 1 + engine_args: + model_stage: qwen3_tts + model_arch: Qwen3TTSTalkerForConditionalGeneration + hf_overrides: + architectures: [Qwen3TTSTalkerForConditionalGeneration] + worker_type: ar + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + enforce_eager: false + trust_remote_code: true + async_scheduling: false + enable_prefix_caching: false + engine_output_type: latent + gpu_memory_utilization: 0.3 + distributed_executor_backend: "mp" + max_num_batched_tokens: 512 + max_model_len: 4096 + custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_tts.talker2code2wav_async_chunk + output_connectors: + to_stage_1: connector_of_shared_memory + default_sampling_params: + temperature: 0.9 + top_k: 50 + max_tokens: 4096 + seed: 42 + detokenize: false + repetition_penalty: 1.05 + stop_token_ids: [2150] + 
+ - stage_id: 1 + stage_type: llm + runtime: + devices: "0" + max_batch_size: 1 + engine_args: + model_stage: code2wav + model_arch: Qwen3TTSCode2Wav + hf_overrides: + architectures: [Qwen3TTSCode2Wav] + worker_type: generation + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + enforce_eager: true + trust_remote_code: true + async_scheduling: false + enable_prefix_caching: false + engine_output_type: audio + gpu_memory_utilization: 0.2 + distributed_executor_backend: "mp" + max_num_batched_tokens: 8192 + max_model_len: 32768 + engine_input_source: [0] + final_output: true + final_output_type: audio + input_connectors: + from_stage_0: connector_of_shared_memory + tts_args: + max_instructions_length: 500 + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 65536 + seed: 42 + detokenize: true + repetition_penalty: 1.0 + +runtime: + enabled: true + defaults: + window_size: -1 + max_inflight: 1 + + connectors: + connector_of_shared_memory: + name: SharedMemoryConnector + extra: + shm_threshold_bytes: 65536 + codec_streaming: true + connector_get_sleep_s: 0.01 + connector_get_max_wait_first_chunk: 3000 + connector_get_max_wait: 300 + codec_chunk_frames: 25 + codec_left_context_frames: 25 + + edges: + - from: 0 + to: 1 + window_size: -1 diff --git a/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs4.yaml b/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs4.yaml new file mode 100644 index 0000000000..18493f3aee --- /dev/null +++ b/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs4.yaml @@ -0,0 +1,97 @@ +# Qwen3-TTS batch_size=4 config (streaming with async_chunk) +# Enables concurrent request processing with max_inflight=4 +# 2-stage pipeline: Talker -> Code2Wav +async_chunk: true +stage_args: + - stage_id: 0 + stage_type: llm + runtime: + devices: "0" + max_batch_size: 4 + engine_args: + model_stage: qwen3_tts + model_arch: Qwen3TTSTalkerForConditionalGeneration + hf_overrides: + architectures: 
[Qwen3TTSTalkerForConditionalGeneration] + worker_type: ar + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + enforce_eager: false + trust_remote_code: true + async_scheduling: false + enable_prefix_caching: false + engine_output_type: latent + gpu_memory_utilization: 0.3 + distributed_executor_backend: "mp" + max_num_batched_tokens: 512 + max_model_len: 4096 + custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_tts.talker2code2wav_async_chunk + output_connectors: + to_stage_1: connector_of_shared_memory + default_sampling_params: + temperature: 0.9 + top_k: 50 + max_tokens: 4096 + seed: 42 + detokenize: false + repetition_penalty: 1.05 + stop_token_ids: [2150] + + - stage_id: 1 + stage_type: llm + runtime: + devices: "0" + max_batch_size: 4 + engine_args: + model_stage: code2wav + model_arch: Qwen3TTSCode2Wav + hf_overrides: + architectures: [Qwen3TTSCode2Wav] + worker_type: generation + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + enforce_eager: true + trust_remote_code: true + async_scheduling: false + enable_prefix_caching: false + engine_output_type: audio + gpu_memory_utilization: 0.2 + distributed_executor_backend: "mp" + max_num_batched_tokens: 8192 + max_model_len: 32768 + engine_input_source: [0] + final_output: true + final_output_type: audio + input_connectors: + from_stage_0: connector_of_shared_memory + tts_args: + max_instructions_length: 500 + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 65536 + seed: 42 + detokenize: true + repetition_penalty: 1.0 + +runtime: + enabled: true + defaults: + window_size: -1 + max_inflight: 4 + + connectors: + connector_of_shared_memory: + name: SharedMemoryConnector + extra: + shm_threshold_bytes: 65536 + codec_streaming: true + connector_get_sleep_s: 0.01 + connector_get_max_wait_first_chunk: 3000 + connector_get_max_wait: 300 + codec_chunk_frames: 25 + 
codec_left_context_frames: 25 + + edges: + - from: 0 + to: 1 + window_size: -1