From 6dd5ad26bdd72904e6c5bd97928f64c1deb65c67 Mon Sep 17 00:00:00 2001
From: Roman Koshkin <roman.koshkin@sbintuitions.co.jp>
Date: Sat, 21 Feb 2026 09:06:34 +0900
Subject: [PATCH 1/4] [Bugfix] (qwen3_tts): enable batched offline inference by
 fixing tensor slicing

Signed-off-by: Roman Koshkin <roman.koshkin@sbintuitions.co.jp>
---
 .gitignore                                    |   2 +
 READMEmy.md                                   |  25 +
 .../qwen3_tts/collect_env.py                  | 760 ++++++++++++++++++
 .../offline_inference/qwen3_tts/end2end.py    |  14 +-
 .../models/qwen3_tts/qwen3_tts.py             | 113 +--
 .../stage_configs/qwen3_tts.yaml              |   4 +-
 6 files changed, 865 insertions(+), 53 deletions(-)
 create mode 100644 READMEmy.md
 create mode 100644 examples/offline_inference/qwen3_tts/collect_env.py

diff --git a/.gitignore b/.gitignore
index 12486f4a7f..214c8efb42 100644
--- a/.gitignore
+++ b/.gitignore
@@ -245,3 +245,5 @@ tmp_test
 
 # output files
 *.wav
+examples/offline_inference/qwen3_tts/test.py
+examples/online_serving/qwen3_tts/Untitled.ipynb
diff --git a/READMEmy.md b/READMEmy.md
new file mode 100644
index 0000000000..31dc180d61
--- /dev/null
+++ b/READMEmy.md
@@ -0,0 +1,25 @@
+# Local development setup: vLLM v0.16.0 + vLLM-Omni (editable installs)
+
+```bash
+cd /lustre/users/rkoshkin
+git clone https://github.com/vllm-project/vllm-omni.git
+cd vllm-omni
+uv venv --python 3.10 --seed
+source .venv/bin/activate
+cd ..
+git clone https://github.com/vllm-project/vllm.git
+cd vllm
+git checkout v0.16.0
+export VLLM_PRECOMPILED_WHEEL_LOCATION=https://github.com/vllm-project/vllm/releases/download/v0.16.0/vllm-0.16.0-cp38-abi3-manylinux_2_31_x86_64.whl
+uv pip install -e .
+cd ../vllm-omni
+uv pip install -e .
+```
+
+To start the online-serving example for Qwen3-TTS:
+
+```bash
+# edit /lustre/users/rkoshkin/vllm-omni/vllm_omni/model_executor/stage_configs/qwen3_tts.yaml AS NECESSARY
+cd examples/online_serving/qwen3_tts
+./run_server.sh Base
+```
diff --git a/examples/offline_inference/qwen3_tts/collect_env.py b/examples/offline_inference/qwen3_tts/collect_env.py
new file mode 100644
index 0000000000..8b09379e1a
--- /dev/null
+++ b/examples/offline_inference/qwen3_tts/collect_env.py
@@ -0,0 +1,760 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# ruff: noqa
+# code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py
+
+import datetime
+import locale
+import os
+import subprocess
+import sys
+
+# NOTE: the upstream PyTorch script was kept python2 compliant; this copy uses f-strings and needs Python 3.
+# This script outputs relevant system environment info
+# Run it with `python collect_env.py` or `python -m torch.utils.collect_env`
+from collections import namedtuple
+
+import regex as re
+
+from vllm.envs import environment_variables
+
+try:
+    import torch
+
+    TORCH_AVAILABLE = True
+except (ImportError, NameError, AttributeError, OSError):
+    TORCH_AVAILABLE = False
+
+# System Environment Information
+SystemEnv = namedtuple(
+    "SystemEnv",
+    [
+        "torch_version",
+        "is_debug_build",
+        "cuda_compiled_version",
+        "gcc_version",
+        "clang_version",
+        "cmake_version",
+        "os",
+        "libc_version",
+        "python_version",
+        "python_platform",
+        "is_cuda_available",
+        "cuda_runtime_version",
+        "cuda_module_loading",
+        "nvidia_driver_version",
+        "nvidia_gpu_models",
+        "cudnn_version",
+        "pip_version",  # 'pip' or 'pip3'
+        "pip_packages",
+        "conda_packages",
+        "hip_compiled_version",
+        "hip_runtime_version",
+        "miopen_runtime_version",
+        "caching_allocator_config",
+        "is_xnnpack_available",
+        "cpu_info",
+        "rocm_version",  # vllm specific field
+        "vllm_version",  # vllm specific field
+        "vllm_omni_version",  # vllm-omni specific field
+        "vllm_build_flags",  # vllm specific field
+        "gpu_topo",  # vllm specific field
+        "env_vars",
+    ],
+)
+
+DEFAULT_CONDA_PATTERNS = {
+    "torch",
+    "numpy",
+    "cudatoolkit",
+    "soumith",
+    "mkl",
+    "magma",
+    "triton",
+    "optree",
+    "nccl",
+    "transformers",
+    "zmq",
+    "nvidia",
+    "pynvml",
+    "flashinfer-python",
+}
+
+DEFAULT_PIP_PATTERNS = {
+    "torch",
+    "numpy",
+    "mypy",
+    "flake8",
+    "triton",
+    "optree",
+    "onnx",
+    "nccl",
+    "transformers",
+    "zmq",
+    "nvidia",
+    "pynvml",
+    "flashinfer-python",
+}
+
+
+def run(command):
+    """Run `command` and return (return-code, stdout, stderr) as decoded, whitespace-stripped text."""
+    shell = isinstance(command, str)  # str commands go through the shell; sequences are exec'd directly
+    try:
+        p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=shell)
+        raw_output, raw_err = p.communicate()
+        rc = p.returncode
+        if get_platform() == "win32":
+            enc = "oem"
+        else:
+            enc = locale.getpreferredencoding()
+        output = raw_output.decode(enc, errors="replace")  # undecodable bytes must not abort the report
+        if command == "nvidia-smi topo -m":
+            # don't remove the leading whitespace of `nvidia-smi topo -m`
+            #   because they are meaningful
+            output = output.rstrip()
+        else:
+            output = output.strip()
+        err = raw_err.decode(enc, errors="replace")
+        return rc, output, err.strip()
+
+    except FileNotFoundError:
+        cmd_str = command if isinstance(command, str) else command[0]
+        return 127, "", f"Command not found: {cmd_str}"
+
+
+def run_and_read_all(run_lambda, command):
+    """Run command using run_lambda; reads and returns entire output if rc is 0."""
+    rc, out, _ = run_lambda(command)
+    if rc != 0:
+        return None
+    return out
+
+
+def run_and_parse_first_match(run_lambda, command, regex):
+    """Run command using run_lambda, returns the first regex match if it exists."""
+    rc, out, _ = run_lambda(command)
+    if rc != 0:
+        return None
+    match = re.search(regex, out)
+    if match is None:
+        return None
+    return match.group(1)
+
+
+def run_and_return_first_line(run_lambda, command):
+    """Run command using run_lambda and returns first line if output is not empty."""
+    rc, out, _ = run_lambda(command)
+    if rc != 0:
+        return None
+    return out.split("\n")[0]
+
+
+def get_conda_packages(run_lambda, patterns=None):
+    if patterns is None:
+        patterns = DEFAULT_CONDA_PATTERNS
+    conda = os.environ.get("CONDA_EXE", "conda")
+    out = run_and_read_all(run_lambda, [conda, "list"])
+    if out is None:
+        return out
+
+    return "\n".join(
+        line for line in out.splitlines() if not line.startswith("#") and any(name in line for name in patterns)
+    )
+
+
+def get_gcc_version(run_lambda):
+    return run_and_parse_first_match(run_lambda, "gcc --version", r"gcc (.*)")
+
+
+def get_clang_version(run_lambda):
+    return run_and_parse_first_match(run_lambda, "clang --version", r"clang version (.*)")
+
+
+def get_cmake_version(run_lambda):
+    return run_and_parse_first_match(run_lambda, "cmake --version", r"cmake (.*)")
+
+
+def get_nvidia_driver_version(run_lambda):
+    if get_platform() == "darwin":
+        cmd = "kextstat | grep -i cuda"
+        return run_and_parse_first_match(run_lambda, cmd, r"com[.]nvidia[.]CUDA [(](.*?)[)]")
+    smi = get_nvidia_smi()
+    return run_and_parse_first_match(run_lambda, smi, r"Driver Version: (.*?) ")
+
+
+def get_gpu_info(run_lambda):
+    if get_platform() == "darwin" or (
+        TORCH_AVAILABLE and hasattr(torch.version, "hip") and torch.version.hip is not None
+    ):
+        if TORCH_AVAILABLE and torch.cuda.is_available():
+            if torch.version.hip is not None:
+                prop = torch.cuda.get_device_properties(0)
+                if hasattr(prop, "gcnArchName"):
+                    gcnArch = " ({})".format(prop.gcnArchName)
+                else:
+                    gcnArch = "NoGCNArchNameOnOldPyTorch"
+            else:
+                gcnArch = ""
+            return torch.cuda.get_device_name(None) + gcnArch
+        return None
+    smi = get_nvidia_smi()
+    uuid_regex = re.compile(r" \(UUID: .+?\)")
+    rc, out, _ = run_lambda(smi + " -L")
+    if rc != 0:
+        return None
+    # Anonymize GPUs by removing their UUID
+    return re.sub(uuid_regex, "", out)
+
+
+def get_running_cuda_version(run_lambda):
+    return run_and_parse_first_match(run_lambda, "nvcc --version", r"release .+ V(.*)")
+
+
+def get_cudnn_version(run_lambda):
+    """Return a list of libcudnn.so; it's hard to tell which one is being used."""
+    if get_platform() == "win32":
+        system_root = os.environ.get("SYSTEMROOT", "C:\\Windows")
+        cuda_path = os.environ.get("CUDA_PATH", "%CUDA_PATH%")
+        where_cmd = os.path.join(system_root, "System32", "where")
+        cudnn_cmd = '{} /R "{}\\bin" cudnn*.dll'.format(where_cmd, cuda_path)
+    elif get_platform() == "darwin":
+        # CUDA libraries and drivers can be found in /usr/local/cuda/. See
+        # https://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#install
+        # https://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html#installmac
+        # Use CUDNN_LIBRARY when cudnn library is installed elsewhere.
+        cudnn_cmd = "ls /usr/local/cuda/lib/libcudnn*"
+    else:
+        cudnn_cmd = 'ldconfig -p | grep libcudnn | rev | cut -d" " -f1 | rev'
+    rc, out, _ = run_lambda(cudnn_cmd)
+    # find will return 1 if there are permission errors or if not found
+    if len(out) == 0 or (rc != 1 and rc != 0):
+        l = os.environ.get("CUDNN_LIBRARY")
+        if l is not None and os.path.isfile(l):
+            return os.path.realpath(l)
+        return None
+    files_set = set()
+    for fn in out.split("\n"):
+        fn = os.path.realpath(fn)  # eliminate symbolic links
+        if os.path.isfile(fn):
+            files_set.add(fn)
+    if not files_set:
+        return None
+    # Alphabetize the result because the order is non-deterministic otherwise
+    files = sorted(files_set)
+    if len(files) == 1:
+        return files[0]
+    result = "\n".join(files)
+    return "Probably one of the following:\n{}".format(result)
+
+
+def get_nvidia_smi():
+    # Note: nvidia-smi is currently available only on Windows and Linux
+    smi = "nvidia-smi"
+    if get_platform() == "win32":
+        system_root = os.environ.get("SYSTEMROOT", "C:\\Windows")
+        program_files_root = os.environ.get("PROGRAMFILES", "C:\\Program Files")
+        legacy_path = os.path.join(program_files_root, "NVIDIA Corporation", "NVSMI", smi)
+        new_path = os.path.join(system_root, "System32", smi)
+        smis = [new_path, legacy_path]
+        for candidate_smi in smis:
+            if os.path.exists(candidate_smi):
+                smi = '"{}"'.format(candidate_smi)
+                break
+    return smi
+
+
+def get_rocm_version(run_lambda):
+    """Return the text matched after 'HIP version:' in `hipcc --version`, or None if the command fails or nothing matches."""
+    return run_and_parse_first_match(run_lambda, "hipcc --version", r"HIP version: (\S+)")
+
+
+def get_vllm_version():
+    from vllm import __version__, __version_tuple__
+
+    if __version__ == "dev":
+        return "N/A (dev)"
+    version_str = __version_tuple__[-1]
+    if isinstance(version_str, str) and version_str.startswith("g"):
+        # it's a dev build
+        if "." in version_str:
+            # it's a dev build containing local changes
+            git_sha = version_str.split(".")[0][1:]
+            date = version_str.split(".")[-1][1:]
+            return f"{__version__} (git sha: {git_sha}, date: {date})"
+        else:
+            # it's a dev build without local changes
+            git_sha = version_str[1:]  # type: ignore
+            return f"{__version__} (git sha: {git_sha})"
+    return __version__
+
+
+def get_vllm_omni_version(run_lambda):
+    """Return the vllm_omni version, with a git sha when one is available, or an 'N/A' note if it is not importable."""
+    try:
+        import vllm_omni
+        from vllm_omni import __version__, __version_tuple__
+
+        version_str = __version_tuple__[-1]
+        if isinstance(version_str, str) and version_str.startswith("g"):
+            if "." in version_str:
+                git_sha = version_str.split(".")[0][1:]
+                date = version_str.split(".")[-1][1:]
+                return f"{__version__} (git sha: {git_sha}, date: {date})"
+            else:
+                git_sha = version_str[1:]
+                return f"{__version__} (git sha: {git_sha})"
+        # No sha in the version tuple: ask git. An argv list (no shell) keeps paths with spaces intact.
+        package_dir = os.path.dirname(os.path.abspath(vllm_omni.__file__))
+        git_sha = run_and_read_all(run_lambda, ["git", "-C", package_dir, "rev-parse", "--short", "HEAD"])
+        if git_sha:
+            return f"{__version__} (git sha: {git_sha})"
+
+        return __version__
+    except ImportError:
+        return "N/A (vllm_omni not installed)"
+
+
+def summarize_vllm_build_flags():
+    # This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc.
+    return "CUDA Archs: {}; ROCm: {}".format(
+        os.environ.get("TORCH_CUDA_ARCH_LIST", "Not Set"),
+        "Enabled" if os.environ.get("ROCM_HOME") else "Disabled",
+    )
+
+
+def get_gpu_topo(run_lambda):
+    output = None
+
+    if get_platform() == "linux":
+        output = run_and_read_all(run_lambda, "nvidia-smi topo -m")
+        if output is None:
+            output = run_and_read_all(run_lambda, "rocm-smi --showtopo")
+
+    return output
+
+
+def get_cpu_info(run_lambda):
+    rc, out, err = 0, "", ""
+    if get_platform() == "linux":
+        rc, out, err = run_lambda("lscpu")
+    elif get_platform() == "win32":
+        rc, out, err = run_lambda(
+            "wmic cpu get Name,Manufacturer,Family,Architecture,ProcessorType,DeviceID, \
+        CurrentClockSpeed,MaxClockSpeed,L2CacheSize,L2CacheSpeed,Revision /VALUE"
+        )
+    elif get_platform() == "darwin":
+        rc, out, err = run_lambda("sysctl -n machdep.cpu.brand_string")
+    cpu_info = "None"
+    if rc == 0:
+        cpu_info = out
+    else:
+        cpu_info = err
+    return cpu_info
+
+
+def get_platform():
+    if sys.platform.startswith("linux"):
+        return "linux"
+    elif sys.platform.startswith("win32"):
+        return "win32"
+    elif sys.platform.startswith("cygwin"):
+        return "cygwin"
+    elif sys.platform.startswith("darwin"):
+        return "darwin"
+    else:
+        return sys.platform
+
+
+def get_mac_version(run_lambda):
+    return run_and_parse_first_match(run_lambda, "sw_vers -productVersion", r"(.*)")
+
+
+def get_windows_version(run_lambda):
+    system_root = os.environ.get("SYSTEMROOT", "C:\\Windows")
+    wmic_cmd = os.path.join(system_root, "System32", "Wbem", "wmic")
+    findstr_cmd = os.path.join(system_root, "System32", "findstr")
+    return run_and_read_all(run_lambda, "{} os get Caption | {} /v Caption".format(wmic_cmd, findstr_cmd))
+
+
+def get_lsb_version(run_lambda):
+    return run_and_parse_first_match(run_lambda, "lsb_release -a", r"Description:\t(.*)")
+
+
+def check_release_file(run_lambda):
+    return run_and_parse_first_match(run_lambda, "cat /etc/*-release", r'PRETTY_NAME="(.*)"')
+
+
+def get_os(run_lambda):
+    from platform import machine
+
+    platform = get_platform()
+
+    if platform == "win32" or platform == "cygwin":
+        return get_windows_version(run_lambda)
+
+    if platform == "darwin":
+        version = get_mac_version(run_lambda)
+        if version is None:
+            return None
+        return "macOS {} ({})".format(version, machine())
+
+    if platform == "linux":
+        # Ubuntu/Debian based
+        desc = get_lsb_version(run_lambda)
+        if desc is not None:
+            return "{} ({})".format(desc, machine())
+
+        # Try reading /etc/*-release
+        desc = check_release_file(run_lambda)
+        if desc is not None:
+            return "{} ({})".format(desc, machine())
+
+        return "{} ({})".format(platform, machine())
+
+    # Unknown platform
+    return platform
+
+
+def get_python_platform():
+    import platform
+
+    return platform.platform()
+
+
+def get_libc_version():
+    import platform
+
+    if get_platform() != "linux":
+        return "N/A"
+    return "-".join(platform.libc_ver())
+
+
+def is_uv_venv():
+    if os.environ.get("UV"):
+        return True
+    pyvenv_cfg_path = os.path.join(sys.prefix, "pyvenv.cfg")
+    if os.path.exists(pyvenv_cfg_path):
+        with open(pyvenv_cfg_path, "r") as f:
+            return any(line.startswith("uv = ") for line in f)
+    return False
+
+
+def get_pip_packages(run_lambda, patterns=None):
+    """Return (pip_version, filtered `pip list` output or None on failure). Note: will also find conda-installed pytorch and numpy packages."""
+    if patterns is None:
+        patterns = DEFAULT_PIP_PATTERNS
+
+    def run_with_pip():
+        try:
+            import importlib.util
+            pip_spec = importlib.util.find_spec("pip")
+            pip_available = pip_spec is not None
+        except ImportError:
+            pip_available = False
+        if pip_available:
+            cmd = [sys.executable, "-mpip", "list", "--format=freeze"]
+        elif is_uv_venv():
+            print("uv is set")
+            cmd = ["uv", "pip", "list", "--format=freeze"]
+        else:
+            raise RuntimeError("Could not collect pip list output (pip or uv module not available)")
+
+        out = run_and_read_all(run_lambda, cmd)
+        if out is None:  # listing command failed; pretty_str turns None into "Could not collect"
+            return None
+        return "\n".join(line for line in out.splitlines() if any(name in line for name in patterns))
+
+    pip_version = "pip3" if sys.version[0] == "3" else "pip"
+    out = run_with_pip()
+    return pip_version, out
+
+
+def get_cachingallocator_config():
+    ca_config = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")
+    return ca_config
+
+
+def get_cuda_module_loading_config():
+    if TORCH_AVAILABLE and torch.cuda.is_available():
+        torch.cuda.init()
+        config = os.environ.get("CUDA_MODULE_LOADING", "")
+        return config
+    else:
+        return "N/A"
+
+
+def is_xnnpack_available():
+    if TORCH_AVAILABLE:
+        import torch.backends.xnnpack
+
+        return str(torch.backends.xnnpack.enabled)  # type: ignore[attr-defined]
+    else:
+        return "N/A"
+
+
+def get_env_vars():
+    """Return `KEY=value` lines for env vars named in `environment_variables` or matching a report prefix, skipping secret-looking names."""
+    env_vars = ""
+    secret_terms = ("secret", "token", "api", "access", "password")
+    report_prefix = (
+        "TORCH",
+        "NCCL",
+        "PYTORCH",
+        "CUDA",
+        "CUBLAS",
+        "CUDNN",
+        "OMP_",
+        "MKL_",
+        "NVIDIA",
+    )
+    for k, v in os.environ.items():
+        if any(term in k.lower() for term in secret_terms):
+            continue
+        # One combined test, so a variable that satisfies both criteria is reported once, not twice.
+        if k in environment_variables or k.startswith(report_prefix):
+            env_vars = env_vars + "{}={}".format(k, v) + "\n"
+
+    return env_vars
+
+
+def get_env_info():
+    run_lambda = run
+    pip_version, pip_list_output = get_pip_packages(run_lambda)
+
+    if TORCH_AVAILABLE:
+        version_str = torch.__version__
+        debug_mode_str = str(torch.version.debug)
+        cuda_available_str = str(torch.cuda.is_available())
+        cuda_version_str = torch.version.cuda
+        if not hasattr(torch.version, "hip") or torch.version.hip is None:  # cuda version
+            hip_compiled_version = hip_runtime_version = miopen_runtime_version = "N/A"
+        else:  # HIP version
+
+            def get_version_or_na(cfg, prefix):
+                _lst = [s.rsplit(None, 1)[-1] for s in cfg if prefix in s]
+                return _lst[0] if _lst else "N/A"
+
+            cfg = torch._C._show_config().split("\n")
+            hip_runtime_version = get_version_or_na(cfg, "HIP Runtime")
+            miopen_runtime_version = get_version_or_na(cfg, "MIOpen")
+            cuda_version_str = "N/A"
+            hip_compiled_version = torch.version.hip
+    else:
+        version_str = debug_mode_str = cuda_available_str = cuda_version_str = "N/A"
+        hip_compiled_version = hip_runtime_version = miopen_runtime_version = "N/A"
+
+    sys_version = sys.version.replace("\n", " ")
+
+    conda_packages = get_conda_packages(run_lambda)
+
+    rocm_version = get_rocm_version(run_lambda)
+    vllm_version = get_vllm_version()
+    vllm_omni_version = get_vllm_omni_version(run_lambda)
+    vllm_build_flags = summarize_vllm_build_flags()
+    gpu_topo = get_gpu_topo(run_lambda)
+
+    return SystemEnv(
+        torch_version=version_str,
+        is_debug_build=debug_mode_str,
+        python_version="{} ({}-bit runtime)".format(sys_version, sys.maxsize.bit_length() + 1),
+        python_platform=get_python_platform(),
+        is_cuda_available=cuda_available_str,
+        cuda_compiled_version=cuda_version_str,
+        cuda_runtime_version=get_running_cuda_version(run_lambda),
+        cuda_module_loading=get_cuda_module_loading_config(),
+        nvidia_gpu_models=get_gpu_info(run_lambda),
+        nvidia_driver_version=get_nvidia_driver_version(run_lambda),
+        cudnn_version=get_cudnn_version(run_lambda),
+        hip_compiled_version=hip_compiled_version,
+        hip_runtime_version=hip_runtime_version,
+        miopen_runtime_version=miopen_runtime_version,
+        pip_version=pip_version,
+        pip_packages=pip_list_output,
+        conda_packages=conda_packages,
+        os=get_os(run_lambda),
+        libc_version=get_libc_version(),
+        gcc_version=get_gcc_version(run_lambda),
+        clang_version=get_clang_version(run_lambda),
+        cmake_version=get_cmake_version(run_lambda),
+        caching_allocator_config=get_cachingallocator_config(),
+        is_xnnpack_available=is_xnnpack_available(),
+        cpu_info=get_cpu_info(run_lambda),
+        rocm_version=rocm_version,
+        vllm_version=vllm_version,
+        vllm_omni_version=vllm_omni_version,
+        vllm_build_flags=vllm_build_flags,
+        gpu_topo=gpu_topo,
+        env_vars=get_env_vars(),
+    )
+
+
+env_info_fmt = """
+==============================
+        System Info
+==============================
+OS                           : {os}
+GCC version                  : {gcc_version}
+Clang version                : {clang_version}
+CMake version                : {cmake_version}
+Libc version                 : {libc_version}
+
+==============================
+       PyTorch Info
+==============================
+PyTorch version              : {torch_version}
+Is debug build               : {is_debug_build}
+CUDA used to build PyTorch   : {cuda_compiled_version}
+ROCM used to build PyTorch   : {hip_compiled_version}
+
+==============================
+      Python Environment
+==============================
+Python version               : {python_version}
+Python platform              : {python_platform}
+
+==============================
+       CUDA / GPU Info
+==============================
+Is CUDA available            : {is_cuda_available}
+CUDA runtime version         : {cuda_runtime_version}
+CUDA_MODULE_LOADING set to   : {cuda_module_loading}
+GPU models and configuration : {nvidia_gpu_models}
+Nvidia driver version        : {nvidia_driver_version}
+cuDNN version                : {cudnn_version}
+HIP runtime version          : {hip_runtime_version}
+MIOpen runtime version       : {miopen_runtime_version}
+Is XNNPACK available         : {is_xnnpack_available}
+
+==============================
+          CPU Info
+==============================
+{cpu_info}
+
+==============================
+Versions of relevant libraries
+==============================
+{pip_packages}
+{conda_packages}
+""".strip()
+
+# both the above code and the following code use `strip()` to
+# remove leading/trailing whitespaces, so we need to add a newline
+# in between to separate the two sections
+env_info_fmt += "\n\n"
+
+env_info_fmt += """
+==============================
+         vLLM Info
+==============================
+ROCM Version                 : {rocm_version}
+vLLM Version                 : {vllm_version}
+vLLM-Omni Version            : {vllm_omni_version}
+vLLM Build Flags:
+  {vllm_build_flags}
+GPU Topology:
+  {gpu_topo}
+
+==============================
+     Environment Variables
+==============================
+{env_vars}
+""".strip()
+
+
+def pretty_str(envinfo):
+    def replace_nones(dct, replacement="Could not collect"):
+        for key in dct.keys():
+            if dct[key] is not None:
+                continue
+            dct[key] = replacement
+        return dct
+
+    def replace_bools(dct, true="Yes", false="No"):
+        for key in dct.keys():
+            if dct[key] is True:
+                dct[key] = true
+            elif dct[key] is False:
+                dct[key] = false
+        return dct
+
+    def prepend(text, tag="[prepend]"):
+        lines = text.split("\n")
+        updated_lines = [tag + line for line in lines]
+        return "\n".join(updated_lines)
+
+    def replace_if_empty(text, replacement="No relevant packages"):
+        if text is not None and len(text) == 0:
+            return replacement
+        return text
+
+    def maybe_start_on_next_line(string):
+        # If `string` is multiline, prepend a \n to it.
+        if string is not None and len(string.split("\n")) > 1:
+            return "\n{}\n".format(string)
+        return string
+
+    mutable_dict = envinfo._asdict()
+
+    # If nvidia_gpu_models is multiline, start on the next line
+    mutable_dict["nvidia_gpu_models"] = maybe_start_on_next_line(envinfo.nvidia_gpu_models)
+
+    # If the machine doesn't have CUDA, report some fields as 'No CUDA'
+    dynamic_cuda_fields = [
+        "cuda_runtime_version",
+        "nvidia_gpu_models",
+        "nvidia_driver_version",
+    ]
+    all_cuda_fields = dynamic_cuda_fields + ["cudnn_version"]
+    all_dynamic_cuda_fields_missing = all(mutable_dict[field] is None for field in dynamic_cuda_fields)
+    if TORCH_AVAILABLE and not torch.cuda.is_available() and all_dynamic_cuda_fields_missing:
+        for field in all_cuda_fields:
+            mutable_dict[field] = "No CUDA"
+        if envinfo.cuda_compiled_version is None:
+            mutable_dict["cuda_compiled_version"] = "None"
+
+    # Replace True with Yes, False with No
+    mutable_dict = replace_bools(mutable_dict)
+
+    # Replace all None objects with 'Could not collect'
+    mutable_dict = replace_nones(mutable_dict)
+
+    # If either of these are '', replace with 'No relevant packages'
+    mutable_dict["pip_packages"] = replace_if_empty(mutable_dict["pip_packages"])
+    mutable_dict["conda_packages"] = replace_if_empty(mutable_dict["conda_packages"])
+
+    # Tag conda and pip packages with a prefix
+    # If they were previously None, they'll show up as ie '[conda] Could not collect'
+    if mutable_dict["pip_packages"]:
+        mutable_dict["pip_packages"] = prepend(mutable_dict["pip_packages"], "[{}] ".format(envinfo.pip_version))
+    if mutable_dict["conda_packages"]:
+        mutable_dict["conda_packages"] = prepend(mutable_dict["conda_packages"], "[conda] ")
+    mutable_dict["cpu_info"] = envinfo.cpu_info
+    return env_info_fmt.format(**mutable_dict)
+
+
+def get_pretty_env_info():
+    return pretty_str(get_env_info())
+
+
+def main():
+    """Print the environment report; on Linux also point (on stderr) at the newest PyTorch minidump, if any exist."""
+    print("Collecting environment information...")
+    output = get_pretty_env_info()
+    print(output)
+
+    if TORCH_AVAILABLE and hasattr(torch, "utils") and hasattr(torch.utils, "_crash_handler"):
+        minidump_dir = torch.utils._crash_handler.DEFAULT_MINIDUMP_DIR
+        if sys.platform == "linux" and os.path.isdir(minidump_dir) and os.listdir(minidump_dir):
+            dumps = [os.path.join(minidump_dir, dump) for dump in os.listdir(minidump_dir)]
+            latest = max(dumps, key=os.path.getctime)  # non-empty (checked above), so max() cannot raise ValueError
+            creation_time = datetime.datetime.fromtimestamp(os.path.getctime(latest)).strftime("%Y-%m-%d %H:%M:%S")
+            msg = (
+                "\n*** Detected a minidump at {} created on {}, ".format(latest, creation_time)
+                + "if this is related to your bug please include it when you file a report ***"
+            )
+            print(msg, file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/offline_inference/qwen3_tts/end2end.py b/examples/offline_inference/qwen3_tts/end2end.py
index 93aeba3ca5..ddb57869f9 100644
--- a/examples/offline_inference/qwen3_tts/end2end.py
+++ b/examples/offline_inference/qwen3_tts/end2end.py
@@ -35,10 +35,16 @@ def get_custom_voice_query(use_batch_sample: bool = False) -> QueryResult:
     """
     task_type = "CustomVoice"
     if use_batch_sample:
-        texts = ["其实我真的有发现，我是一个特别善于观察别人情绪的人。", "She said she would be here by noon."]
-        instructs = ["", "Very happy."]
-        languages = ["Chinese", "English"]
-        speakers = ["Vivian", "Ryan"]
+        texts = [
+            "其实我真的有发现，我是一个特别善于观察别人情绪的人。",
+            "She said she would be here by noon.",
+            "I like you very much.",
+            "Really, you do?",
+            "Yes, absolutely.",
+        ]
+        instructs = ["", "Very happy.", "Very happy.", "Very happy.", "Very happy."]
+        languages = ["Chinese", "English", "English", "English", "English"]
+        speakers = ["Vivian", "Ryan", "Ryan", "Ryan", "Ryan"]
         inputs = []
         for text, instruct, language, speaker in zip(texts, instructs, languages, speakers):
             prompt = f"<|im_start|>assistant\n{text}<|im_end|>\n<|im_start|>assistant\n"
diff --git a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py
index 664d2a2957..5dfcdc542f 100644
--- a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py
+++ b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py
@@ -97,53 +97,69 @@ def forward(
         **kwargs: Any,
     ) -> OmniOutput:
         """
-        Forward pass for TTS generation model.
-
-        Args:
-            input_ids: Input token IDs (required for TTS generation)
-            positions: Position IDs (not used for TTS, but required by runner)
-            intermediate_tensors: Intermediate tensors for pipeline parallelism (not used)
-            inputs_embeds: Input embeddings (not used for TTS, but required by runner)
-            **kwargs: Additional arguments including task_type, sampling_metadata, etc.
-
-        Returns:
-            OmniOutput: Contains multimodal outputs with audio tensors
+        Forward pass for TTS generation model (Patched for batched inference).
         """
-
-        # Extract additional parameters from kwargs that the generation methods expect
-
-        runtime_additional_information = kwargs.get("runtime_additional_information", [{}])
-        if isinstance(runtime_additional_information, list) and len(runtime_additional_information) > 0:
-            runtime_additional_information = runtime_additional_information[0]
-        text = runtime_additional_information.pop("text", [""])[0]
-        # Extract task_type from kwargs, default to "instruct"
-        task_type = runtime_additional_information.pop("task_type", [self.task_type])[0]
-        speaker = runtime_additional_information.pop("speaker", ["uncle_fu"])[0]
-        language = runtime_additional_information.pop("language", ["Auto"])[0]
-        instruct = runtime_additional_information.pop("instruct", [""])[0]
-        for key, value in runtime_additional_information.items():
-            if isinstance(value, list) and len(value) > 0:
-                runtime_additional_information[key] = value[0]
-
-        # During profile/warmup runs, text is empty and no real inputs exist.
-        # Cap generation steps so the full pipeline executes (preserving
-        # KV-cache profiling behaviour) but exits quickly even if the model
-        # cannot converge from degenerate dummy inputs.
-        if not text:
+        runtime_info_list = kwargs.get("runtime_additional_information", [{}])
+        if not isinstance(runtime_info_list, list):
+            runtime_info_list = [runtime_info_list]
+
+        # Initialize lists to accumulate batched inputs
+        texts = []
+        task_types = []
+        speakers = []
+        languages = []
+        instructs = []
+        merged_kwargs = {}
+
+        # Keys that the underlying model natively supports as lists for batched inference
+        batched_keys = {"ref_audio", "ref_text", "x_vector_only_mode", "voice_clone_prompt"}
+
+        for req_info in runtime_info_list:
+
+            def extract_val(d, key, default):
+                val = d.get(key, default)
+                if isinstance(val, list):
+                    return val[0] if len(val) > 0 else default
+                return val
+
+            texts.append(extract_val(req_info, "text", ""))
+            task_types.append(extract_val(req_info, "task_type", self.task_type))
+            speakers.append(extract_val(req_info, "speaker", "uncle_fu"))
+            languages.append(extract_val(req_info, "language", "Auto"))
+            instructs.append(extract_val(req_info, "instruct", ""))
+
+            for k, v in req_info.items():
+                if k not in ["text", "task_type", "speaker", "language", "instruct"]:
+                    # Extract single value from list if wrapped
+                    val = v[0] if isinstance(v, list) and len(v) > 0 else v
+
+                    if k in batched_keys:
+                        # Accumulate as list for batched generation
+                        if k not in merged_kwargs:
+                            merged_kwargs[k] = []
+                        merged_kwargs[k].append(val)
+                    else:
+                        # For scalar params (e.g. max_new_tokens), take from the first request
+                        if k not in merged_kwargs:
+                            merged_kwargs[k] = val
+
+        # During profile/warmup runs, texts are empty.
+        if all(not t for t in texts):
             logger.info("Profile run detected (empty text). Capping max_new_tokens to 2.")
-            runtime_additional_information["max_new_tokens"] = 2
+            merged_kwargs["max_new_tokens"] = 2
+
+        # Assume uniform task type across the batch
+        task_type = task_types[0]
 
-        # Call the appropriate generation method based on task_type
+        # Call the appropriate generation method based on task_type, passing lists
         if task_type == "CustomVoice":
             result = self.model.generate_custom_voice(
-                text, speaker=speaker, language=language, instruct=instruct, **runtime_additional_information
+                texts, speaker=speakers, language=languages, instruct=instructs, **merged_kwargs
             )
         elif task_type == "VoiceDesign":
-            result = self.model.generate_voice_design(
-                text, instruct=instruct, language=language, **runtime_additional_information
-            )
+            result = self.model.generate_voice_design(texts, instruct=instructs, language=languages, **merged_kwargs)
         elif task_type == "Base":
-            result = self.model.generate_voice_clone(text, language=language, **runtime_additional_information)
+            result = self.model.generate_voice_clone(texts, language=languages, **merged_kwargs)
         else:
             raise ValueError(f"Invalid task type: {task_type}")
 
@@ -162,17 +178,20 @@ def make_omni_output(self, model_outputs: torch.Tensor | OmniOutput | tuple, **k
         # Handle tuple format: (audio_tensors, sample_rate)
         if isinstance(model_outputs, tuple) and len(model_outputs) == 2:
             audio_tensors, sr = model_outputs
-            # audio_tensors is a list of numpy arrays, convert first one to tensor if needed
+            # audio_tensors is a list of numpy arrays, convert ALL to tensors
             if isinstance(audio_tensors, list) and len(audio_tensors) > 0:
-                # Convert numpy array to tensor if needed
-                audio_tensor = audio_tensors[0]
-                if isinstance(audio_tensor, np.ndarray):
-                    audio_tensor = torch.from_numpy(audio_tensor).float()
-                elif not isinstance(audio_tensor, torch.Tensor):
-                    audio_tensor = torch.tensor(audio_tensor, dtype=torch.float32)
+                audio_tensor_list = []
+                for audio_tensor in audio_tensors:
+                    if isinstance(audio_tensor, np.ndarray):
+                        audio_tensor_list.append(torch.from_numpy(audio_tensor).float())
+                    elif not isinstance(audio_tensor, torch.Tensor):
+                        audio_tensor_list.append(torch.tensor(audio_tensor, dtype=torch.float32))
+                    else:
+                        audio_tensor_list.append(audio_tensor)
+
                 return OmniOutput(
                     text_hidden_states=None,
-                    multimodal_outputs={"model_outputs": audio_tensor, "sr": torch.tensor(sr, dtype=torch.int)},
+                    multimodal_outputs={"model_outputs": audio_tensor_list, "sr": torch.tensor(sr, dtype=torch.int)},
                 )
 
         # If it's already a tensor, wrap it
diff --git a/vllm_omni/model_executor/stage_configs/qwen3_tts.yaml b/vllm_omni/model_executor/stage_configs/qwen3_tts.yaml
index d408dbab91..09cf2a4ccc 100644
--- a/vllm_omni/model_executor/stage_configs/qwen3_tts.yaml
+++ b/vllm_omni/model_executor/stage_configs/qwen3_tts.yaml
@@ -3,7 +3,7 @@ stage_args:
     stage_type: llm  # Use llm stage type to launch OmniLLM
     runtime:
       devices: "0"
-      max_batch_size: 1
+      max_batch_size: 100
     engine_args:
       model_stage: qwen3_tts
       model_arch: Qwen3TTSForConditionalGeneration
@@ -14,7 +14,7 @@ stage_args:
       async_scheduling: false
       enable_prefix_caching: false
       engine_output_type: audio  # Final output: audio waveform
-      gpu_memory_utilization: 0.1
+      gpu_memory_utilization: 0.8
       distributed_executor_backend: "mp"
       max_num_batched_tokens: 1000000
 

From 732453e1a50a86fdbea04fcec50c601b1ea84826 Mon Sep 17 00:00:00 2001
From: Roman Koshkin <roman.koshkin@sbintuitions.co.jp>
Date: Sat, 21 Feb 2026 15:38:31 +0900
Subject: [PATCH 2/4] [Bugfix] [cleanup](qwen3_tts): enable batched offline
 inference by fixing tensor slicing

Signed-off-by: Roman Koshkin <roman.koshkin@sbintuitions.co.jp>
---
 READMEmy.md                                   |  25 -
 .../qwen3_tts/collect_env.py                  | 760 ------------------
 .../models/qwen3_tts/qwen3_tts.py             |  24 +-
 .../stage_configs/qwen3_tts.yaml              |   4 +-
 4 files changed, 14 insertions(+), 799 deletions(-)
 delete mode 100644 READMEmy.md
 delete mode 100644 examples/offline_inference/qwen3_tts/collect_env.py

diff --git a/READMEmy.md b/READMEmy.md
deleted file mode 100644
index 31dc180d61..0000000000
--- a/READMEmy.md
+++ /dev/null
@@ -1,25 +0,0 @@
-
-
-```bash
-cd /lustre/users/rkoshkin
-git clone https://github.com/vllm-project/vllm-omni.git
-cd vllm-omni
-uv venv --python 3.10 --seed
-source .venv/bin/activate
-cd ..
-git clone https://github.com/vllm-project/vllm.git
-cd vllm
-git checkout v0.16.0
-export VLLM_PRECOMPILED_WHEEL_LOCATION=https://github.com/vllm-project/vllm/releases/download/v0.16.0/vllm-0.16.0-cp38-abi3-manylinux_2_31_x86_64.whl
-uv pip install -e .
-cd ../vllm-omni
-uv pip install -e .
-```
-
-
-
-```bash
-# edit /lustre/users/rkoshkin/vllm-omni/vllm_omni/model_executor/stage_configs/qwen3_tts.yaml AS NECESSARY
-cd examples/online_serving/qwen3_tts
-./run_server.sh Base
-```
diff --git a/examples/offline_inference/qwen3_tts/collect_env.py b/examples/offline_inference/qwen3_tts/collect_env.py
deleted file mode 100644
index 8b09379e1a..0000000000
--- a/examples/offline_inference/qwen3_tts/collect_env.py
+++ /dev/null
@@ -1,760 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-# ruff: noqa
-# code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py
-
-import datetime
-import locale
-import os
-import subprocess
-import sys
-
-# Unlike the rest of the PyTorch this file must be python2 compliant.
-# This script outputs relevant system environment info
-# Run it with `python collect_env.py` or `python -m torch.utils.collect_env`
-from collections import namedtuple
-
-import regex as re
-
-from vllm.envs import environment_variables
-
-try:
-    import torch
-
-    TORCH_AVAILABLE = True
-except (ImportError, NameError, AttributeError, OSError):
-    TORCH_AVAILABLE = False
-
-# System Environment Information
-SystemEnv = namedtuple(
-    "SystemEnv",
-    [
-        "torch_version",
-        "is_debug_build",
-        "cuda_compiled_version",
-        "gcc_version",
-        "clang_version",
-        "cmake_version",
-        "os",
-        "libc_version",
-        "python_version",
-        "python_platform",
-        "is_cuda_available",
-        "cuda_runtime_version",
-        "cuda_module_loading",
-        "nvidia_driver_version",
-        "nvidia_gpu_models",
-        "cudnn_version",
-        "pip_version",  # 'pip' or 'pip3'
-        "pip_packages",
-        "conda_packages",
-        "hip_compiled_version",
-        "hip_runtime_version",
-        "miopen_runtime_version",
-        "caching_allocator_config",
-        "is_xnnpack_available",
-        "cpu_info",
-        "rocm_version",  # vllm specific field
-        "vllm_version",  # vllm specific field
-        "vllm_omni_version",  # vllm-omni specific field
-        "vllm_build_flags",  # vllm specific field
-        "gpu_topo",  # vllm specific field
-        "env_vars",
-    ],
-)
-
-DEFAULT_CONDA_PATTERNS = {
-    "torch",
-    "numpy",
-    "cudatoolkit",
-    "soumith",
-    "mkl",
-    "magma",
-    "triton",
-    "optree",
-    "nccl",
-    "transformers",
-    "zmq",
-    "nvidia",
-    "pynvml",
-    "flashinfer-python",
-}
-
-DEFAULT_PIP_PATTERNS = {
-    "torch",
-    "numpy",
-    "mypy",
-    "flake8",
-    "triton",
-    "optree",
-    "onnx",
-    "nccl",
-    "transformers",
-    "zmq",
-    "nvidia",
-    "pynvml",
-    "flashinfer-python",
-}
-
-
-def run(command):
-    """Return (return-code, stdout, stderr)."""
-    shell = True if type(command) is str else False
-    try:
-        p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=shell)
-        raw_output, raw_err = p.communicate()
-        rc = p.returncode
-        if get_platform() == "win32":
-            enc = "oem"
-        else:
-            enc = locale.getpreferredencoding()
-        output = raw_output.decode(enc)
-        if command == "nvidia-smi topo -m":
-            # don't remove the leading whitespace of `nvidia-smi topo -m`
-            #   because they are meaningful
-            output = output.rstrip()
-        else:
-            output = output.strip()
-        err = raw_err.decode(enc)
-        return rc, output, err.strip()
-
-    except FileNotFoundError:
-        cmd_str = command if isinstance(command, str) else command[0]
-        return 127, "", f"Command not found: {cmd_str}"
-
-
-def run_and_read_all(run_lambda, command):
-    """Run command using run_lambda; reads and returns entire output if rc is 0."""
-    rc, out, _ = run_lambda(command)
-    if rc != 0:
-        return None
-    return out
-
-
-def run_and_parse_first_match(run_lambda, command, regex):
-    """Run command using run_lambda, returns the first regex match if it exists."""
-    rc, out, _ = run_lambda(command)
-    if rc != 0:
-        return None
-    match = re.search(regex, out)
-    if match is None:
-        return None
-    return match.group(1)
-
-
-def run_and_return_first_line(run_lambda, command):
-    """Run command using run_lambda and returns first line if output is not empty."""
-    rc, out, _ = run_lambda(command)
-    if rc != 0:
-        return None
-    return out.split("\n")[0]
-
-
-def get_conda_packages(run_lambda, patterns=None):
-    if patterns is None:
-        patterns = DEFAULT_CONDA_PATTERNS
-    conda = os.environ.get("CONDA_EXE", "conda")
-    out = run_and_read_all(run_lambda, [conda, "list"])
-    if out is None:
-        return out
-
-    return "\n".join(
-        line for line in out.splitlines() if not line.startswith("#") and any(name in line for name in patterns)
-    )
-
-
-def get_gcc_version(run_lambda):
-    return run_and_parse_first_match(run_lambda, "gcc --version", r"gcc (.*)")
-
-
-def get_clang_version(run_lambda):
-    return run_and_parse_first_match(run_lambda, "clang --version", r"clang version (.*)")
-
-
-def get_cmake_version(run_lambda):
-    return run_and_parse_first_match(run_lambda, "cmake --version", r"cmake (.*)")
-
-
-def get_nvidia_driver_version(run_lambda):
-    if get_platform() == "darwin":
-        cmd = "kextstat | grep -i cuda"
-        return run_and_parse_first_match(run_lambda, cmd, r"com[.]nvidia[.]CUDA [(](.*?)[)]")
-    smi = get_nvidia_smi()
-    return run_and_parse_first_match(run_lambda, smi, r"Driver Version: (.*?) ")
-
-
-def get_gpu_info(run_lambda):
-    if get_platform() == "darwin" or (
-        TORCH_AVAILABLE and hasattr(torch.version, "hip") and torch.version.hip is not None
-    ):
-        if TORCH_AVAILABLE and torch.cuda.is_available():
-            if torch.version.hip is not None:
-                prop = torch.cuda.get_device_properties(0)
-                if hasattr(prop, "gcnArchName"):
-                    gcnArch = " ({})".format(prop.gcnArchName)
-                else:
-                    gcnArch = "NoGCNArchNameOnOldPyTorch"
-            else:
-                gcnArch = ""
-            return torch.cuda.get_device_name(None) + gcnArch
-        return None
-    smi = get_nvidia_smi()
-    uuid_regex = re.compile(r" \(UUID: .+?\)")
-    rc, out, _ = run_lambda(smi + " -L")
-    if rc != 0:
-        return None
-    # Anonymize GPUs by removing their UUID
-    return re.sub(uuid_regex, "", out)
-
-
-def get_running_cuda_version(run_lambda):
-    return run_and_parse_first_match(run_lambda, "nvcc --version", r"release .+ V(.*)")
-
-
-def get_cudnn_version(run_lambda):
-    """Return a list of libcudnn.so; it's hard to tell which one is being used."""
-    if get_platform() == "win32":
-        system_root = os.environ.get("SYSTEMROOT", "C:\\Windows")
-        cuda_path = os.environ.get("CUDA_PATH", "%CUDA_PATH%")
-        where_cmd = os.path.join(system_root, "System32", "where")
-        cudnn_cmd = '{} /R "{}\\bin" cudnn*.dll'.format(where_cmd, cuda_path)
-    elif get_platform() == "darwin":
-        # CUDA libraries and drivers can be found in /usr/local/cuda/. See
-        # https://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#install
-        # https://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html#installmac
-        # Use CUDNN_LIBRARY when cudnn library is installed elsewhere.
-        cudnn_cmd = "ls /usr/local/cuda/lib/libcudnn*"
-    else:
-        cudnn_cmd = 'ldconfig -p | grep libcudnn | rev | cut -d" " -f1 | rev'
-    rc, out, _ = run_lambda(cudnn_cmd)
-    # find will return 1 if there are permission errors or if not found
-    if len(out) == 0 or (rc != 1 and rc != 0):
-        l = os.environ.get("CUDNN_LIBRARY")
-        if l is not None and os.path.isfile(l):
-            return os.path.realpath(l)
-        return None
-    files_set = set()
-    for fn in out.split("\n"):
-        fn = os.path.realpath(fn)  # eliminate symbolic links
-        if os.path.isfile(fn):
-            files_set.add(fn)
-    if not files_set:
-        return None
-    # Alphabetize the result because the order is non-deterministic otherwise
-    files = sorted(files_set)
-    if len(files) == 1:
-        return files[0]
-    result = "\n".join(files)
-    return "Probably one of the following:\n{}".format(result)
-
-
-def get_nvidia_smi():
-    # Note: nvidia-smi is currently available only on Windows and Linux
-    smi = "nvidia-smi"
-    if get_platform() == "win32":
-        system_root = os.environ.get("SYSTEMROOT", "C:\\Windows")
-        program_files_root = os.environ.get("PROGRAMFILES", "C:\\Program Files")
-        legacy_path = os.path.join(program_files_root, "NVIDIA Corporation", "NVSMI", smi)
-        new_path = os.path.join(system_root, "System32", smi)
-        smis = [new_path, legacy_path]
-        for candidate_smi in smis:
-            if os.path.exists(candidate_smi):
-                smi = '"{}"'.format(candidate_smi)
-                break
-    return smi
-
-
-def get_rocm_version(run_lambda):
-    """Returns the ROCm version if available, otherwise 'N/A'."""
-    return run_and_parse_first_match(run_lambda, "hipcc --version", r"HIP version: (\S+)")
-
-
-def get_vllm_version():
-    from vllm import __version__, __version_tuple__
-
-    if __version__ == "dev":
-        return "N/A (dev)"
-    version_str = __version_tuple__[-1]
-    if isinstance(version_str, str) and version_str.startswith("g"):
-        # it's a dev build
-        if "." in version_str:
-            # it's a dev build containing local changes
-            git_sha = version_str.split(".")[0][1:]
-            date = version_str.split(".")[-1][1:]
-            return f"{__version__} (git sha: {git_sha}, date: {date})"
-        else:
-            # it's a dev build without local changes
-            git_sha = version_str[1:]  # type: ignore
-            return f"{__version__} (git sha: {git_sha})"
-    return __version__
-
-
-def get_vllm_omni_version(run_lambda):
-    try:
-        import vllm_omni
-        from vllm_omni import __version__, __version_tuple__
-
-        version_str = __version_tuple__[-1]
-        if isinstance(version_str, str) and version_str.startswith("g"):
-            if "." in version_str:
-                git_sha = version_str.split(".")[0][1:]
-                date = version_str.split(".")[-1][1:]
-                return f"{__version__} (git sha: {git_sha}, date: {date})"
-            else:
-                git_sha = version_str[1:]
-                return f"{__version__} (git sha: {git_sha})"
-
-        package_dir = os.path.dirname(os.path.abspath(vllm_omni.__file__))
-        git_sha = run_and_read_all(run_lambda, f"git -C {package_dir} rev-parse --short HEAD")
-        if git_sha:
-            return f"{__version__} (git sha: {git_sha})"
-
-        return __version__
-    except ImportError:
-        return "N/A (vllm_omni not installed)"
-
-
-def summarize_vllm_build_flags():
-    # This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc.
-    return "CUDA Archs: {}; ROCm: {}".format(
-        os.environ.get("TORCH_CUDA_ARCH_LIST", "Not Set"),
-        "Enabled" if os.environ.get("ROCM_HOME") else "Disabled",
-    )
-
-
-def get_gpu_topo(run_lambda):
-    output = None
-
-    if get_platform() == "linux":
-        output = run_and_read_all(run_lambda, "nvidia-smi topo -m")
-        if output is None:
-            output = run_and_read_all(run_lambda, "rocm-smi --showtopo")
-
-    return output
-
-
-def get_cpu_info(run_lambda):
-    rc, out, err = 0, "", ""
-    if get_platform() == "linux":
-        rc, out, err = run_lambda("lscpu")
-    elif get_platform() == "win32":
-        rc, out, err = run_lambda(
-            "wmic cpu get Name,Manufacturer,Family,Architecture,ProcessorType,DeviceID, \
-        CurrentClockSpeed,MaxClockSpeed,L2CacheSize,L2CacheSpeed,Revision /VALUE"
-        )
-    elif get_platform() == "darwin":
-        rc, out, err = run_lambda("sysctl -n machdep.cpu.brand_string")
-    cpu_info = "None"
-    if rc == 0:
-        cpu_info = out
-    else:
-        cpu_info = err
-    return cpu_info
-
-
-def get_platform():
-    if sys.platform.startswith("linux"):
-        return "linux"
-    elif sys.platform.startswith("win32"):
-        return "win32"
-    elif sys.platform.startswith("cygwin"):
-        return "cygwin"
-    elif sys.platform.startswith("darwin"):
-        return "darwin"
-    else:
-        return sys.platform
-
-
-def get_mac_version(run_lambda):
-    return run_and_parse_first_match(run_lambda, "sw_vers -productVersion", r"(.*)")
-
-
-def get_windows_version(run_lambda):
-    system_root = os.environ.get("SYSTEMROOT", "C:\\Windows")
-    wmic_cmd = os.path.join(system_root, "System32", "Wbem", "wmic")
-    findstr_cmd = os.path.join(system_root, "System32", "findstr")
-    return run_and_read_all(run_lambda, "{} os get Caption | {} /v Caption".format(wmic_cmd, findstr_cmd))
-
-
-def get_lsb_version(run_lambda):
-    return run_and_parse_first_match(run_lambda, "lsb_release -a", r"Description:\t(.*)")
-
-
-def check_release_file(run_lambda):
-    return run_and_parse_first_match(run_lambda, "cat /etc/*-release", r'PRETTY_NAME="(.*)"')
-
-
-def get_os(run_lambda):
-    from platform import machine
-
-    platform = get_platform()
-
-    if platform == "win32" or platform == "cygwin":
-        return get_windows_version(run_lambda)
-
-    if platform == "darwin":
-        version = get_mac_version(run_lambda)
-        if version is None:
-            return None
-        return "macOS {} ({})".format(version, machine())
-
-    if platform == "linux":
-        # Ubuntu/Debian based
-        desc = get_lsb_version(run_lambda)
-        if desc is not None:
-            return "{} ({})".format(desc, machine())
-
-        # Try reading /etc/*-release
-        desc = check_release_file(run_lambda)
-        if desc is not None:
-            return "{} ({})".format(desc, machine())
-
-        return "{} ({})".format(platform, machine())
-
-    # Unknown platform
-    return platform
-
-
-def get_python_platform():
-    import platform
-
-    return platform.platform()
-
-
-def get_libc_version():
-    import platform
-
-    if get_platform() != "linux":
-        return "N/A"
-    return "-".join(platform.libc_ver())
-
-
-def is_uv_venv():
-    if os.environ.get("UV"):
-        return True
-    pyvenv_cfg_path = os.path.join(sys.prefix, "pyvenv.cfg")
-    if os.path.exists(pyvenv_cfg_path):
-        with open(pyvenv_cfg_path, "r") as f:
-            return any(line.startswith("uv = ") for line in f)
-    return False
-
-
-def get_pip_packages(run_lambda, patterns=None):
-    """Return `pip list` output. Note: will also find conda-installed pytorch and numpy packages."""
-    if patterns is None:
-        patterns = DEFAULT_PIP_PATTERNS
-
-    def run_with_pip():
-        try:
-            import importlib.util
-
-            pip_spec = importlib.util.find_spec("pip")
-            pip_available = pip_spec is not None
-        except ImportError:
-            pip_available = False
-
-        if pip_available:
-            cmd = [sys.executable, "-mpip", "list", "--format=freeze"]
-        elif is_uv_venv():
-            print("uv is set")
-            cmd = ["uv", "pip", "list", "--format=freeze"]
-        else:
-            raise RuntimeError("Could not collect pip list output (pip or uv module not available)")
-
-        out = run_and_read_all(run_lambda, cmd)
-        return "\n".join(line for line in out.splitlines() if any(name in line for name in patterns))
-
-    pip_version = "pip3" if sys.version[0] == "3" else "pip"
-    out = run_with_pip()
-    return pip_version, out
-
-
-def get_cachingallocator_config():
-    ca_config = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")
-    return ca_config
-
-
-def get_cuda_module_loading_config():
-    if TORCH_AVAILABLE and torch.cuda.is_available():
-        torch.cuda.init()
-        config = os.environ.get("CUDA_MODULE_LOADING", "")
-        return config
-    else:
-        return "N/A"
-
-
-def is_xnnpack_available():
-    if TORCH_AVAILABLE:
-        import torch.backends.xnnpack
-
-        return str(torch.backends.xnnpack.enabled)  # type: ignore[attr-defined]
-    else:
-        return "N/A"
-
-
-def get_env_vars():
-    env_vars = ""
-    secret_terms = ("secret", "token", "api", "access", "password")
-    report_prefix = (
-        "TORCH",
-        "NCCL",
-        "PYTORCH",
-        "CUDA",
-        "CUBLAS",
-        "CUDNN",
-        "OMP_",
-        "MKL_",
-        "NVIDIA",
-    )
-    for k, v in os.environ.items():
-        if any(term in k.lower() for term in secret_terms):
-            continue
-        if k in environment_variables:
-            env_vars = env_vars + "{}={}".format(k, v) + "\n"
-        if k.startswith(report_prefix):
-            env_vars = env_vars + "{}={}".format(k, v) + "\n"
-
-    return env_vars
-
-
-def get_env_info():
-    run_lambda = run
-    pip_version, pip_list_output = get_pip_packages(run_lambda)
-
-    if TORCH_AVAILABLE:
-        version_str = torch.__version__
-        debug_mode_str = str(torch.version.debug)
-        cuda_available_str = str(torch.cuda.is_available())
-        cuda_version_str = torch.version.cuda
-        if not hasattr(torch.version, "hip") or torch.version.hip is None:  # cuda version
-            hip_compiled_version = hip_runtime_version = miopen_runtime_version = "N/A"
-        else:  # HIP version
-
-            def get_version_or_na(cfg, prefix):
-                _lst = [s.rsplit(None, 1)[-1] for s in cfg if prefix in s]
-                return _lst[0] if _lst else "N/A"
-
-            cfg = torch._C._show_config().split("\n")
-            hip_runtime_version = get_version_or_na(cfg, "HIP Runtime")
-            miopen_runtime_version = get_version_or_na(cfg, "MIOpen")
-            cuda_version_str = "N/A"
-            hip_compiled_version = torch.version.hip
-    else:
-        version_str = debug_mode_str = cuda_available_str = cuda_version_str = "N/A"
-        hip_compiled_version = hip_runtime_version = miopen_runtime_version = "N/A"
-
-    sys_version = sys.version.replace("\n", " ")
-
-    conda_packages = get_conda_packages(run_lambda)
-
-    rocm_version = get_rocm_version(run_lambda)
-    vllm_version = get_vllm_version()
-    vllm_omni_version = get_vllm_omni_version(run_lambda)
-    vllm_build_flags = summarize_vllm_build_flags()
-    gpu_topo = get_gpu_topo(run_lambda)
-
-    return SystemEnv(
-        torch_version=version_str,
-        is_debug_build=debug_mode_str,
-        python_version="{} ({}-bit runtime)".format(sys_version, sys.maxsize.bit_length() + 1),
-        python_platform=get_python_platform(),
-        is_cuda_available=cuda_available_str,
-        cuda_compiled_version=cuda_version_str,
-        cuda_runtime_version=get_running_cuda_version(run_lambda),
-        cuda_module_loading=get_cuda_module_loading_config(),
-        nvidia_gpu_models=get_gpu_info(run_lambda),
-        nvidia_driver_version=get_nvidia_driver_version(run_lambda),
-        cudnn_version=get_cudnn_version(run_lambda),
-        hip_compiled_version=hip_compiled_version,
-        hip_runtime_version=hip_runtime_version,
-        miopen_runtime_version=miopen_runtime_version,
-        pip_version=pip_version,
-        pip_packages=pip_list_output,
-        conda_packages=conda_packages,
-        os=get_os(run_lambda),
-        libc_version=get_libc_version(),
-        gcc_version=get_gcc_version(run_lambda),
-        clang_version=get_clang_version(run_lambda),
-        cmake_version=get_cmake_version(run_lambda),
-        caching_allocator_config=get_cachingallocator_config(),
-        is_xnnpack_available=is_xnnpack_available(),
-        cpu_info=get_cpu_info(run_lambda),
-        rocm_version=rocm_version,
-        vllm_version=vllm_version,
-        vllm_omni_version=vllm_omni_version,
-        vllm_build_flags=vllm_build_flags,
-        gpu_topo=gpu_topo,
-        env_vars=get_env_vars(),
-    )
-
-
-env_info_fmt = """
-==============================
-        System Info
-==============================
-OS                           : {os}
-GCC version                  : {gcc_version}
-Clang version                : {clang_version}
-CMake version                : {cmake_version}
-Libc version                 : {libc_version}
-
-==============================
-       PyTorch Info
-==============================
-PyTorch version              : {torch_version}
-Is debug build               : {is_debug_build}
-CUDA used to build PyTorch   : {cuda_compiled_version}
-ROCM used to build PyTorch   : {hip_compiled_version}
-
-==============================
-      Python Environment
-==============================
-Python version               : {python_version}
-Python platform              : {python_platform}
-
-==============================
-       CUDA / GPU Info
-==============================
-Is CUDA available            : {is_cuda_available}
-CUDA runtime version         : {cuda_runtime_version}
-CUDA_MODULE_LOADING set to   : {cuda_module_loading}
-GPU models and configuration : {nvidia_gpu_models}
-Nvidia driver version        : {nvidia_driver_version}
-cuDNN version                : {cudnn_version}
-HIP runtime version          : {hip_runtime_version}
-MIOpen runtime version       : {miopen_runtime_version}
-Is XNNPACK available         : {is_xnnpack_available}
-
-==============================
-          CPU Info
-==============================
-{cpu_info}
-
-==============================
-Versions of relevant libraries
-==============================
-{pip_packages}
-{conda_packages}
-""".strip()
-
-# both the above code and the following code use `strip()` to
-# remove leading/trailing whitespaces, so we need to add a newline
-# in between to separate the two sections
-env_info_fmt += "\n\n"
-
-env_info_fmt += """
-==============================
-         vLLM Info
-==============================
-ROCM Version                 : {rocm_version}
-vLLM Version                 : {vllm_version}
-vLLM-Omni Version            : {vllm_omni_version}
-vLLM Build Flags:
-  {vllm_build_flags}
-GPU Topology:
-  {gpu_topo}
-
-==============================
-     Environment Variables
-==============================
-{env_vars}
-""".strip()
-
-
-def pretty_str(envinfo):
-    def replace_nones(dct, replacement="Could not collect"):
-        for key in dct.keys():
-            if dct[key] is not None:
-                continue
-            dct[key] = replacement
-        return dct
-
-    def replace_bools(dct, true="Yes", false="No"):
-        for key in dct.keys():
-            if dct[key] is True:
-                dct[key] = true
-            elif dct[key] is False:
-                dct[key] = false
-        return dct
-
-    def prepend(text, tag="[prepend]"):
-        lines = text.split("\n")
-        updated_lines = [tag + line for line in lines]
-        return "\n".join(updated_lines)
-
-    def replace_if_empty(text, replacement="No relevant packages"):
-        if text is not None and len(text) == 0:
-            return replacement
-        return text
-
-    def maybe_start_on_next_line(string):
-        # If `string` is multiline, prepend a \n to it.
-        if string is not None and len(string.split("\n")) > 1:
-            return "\n{}\n".format(string)
-        return string
-
-    mutable_dict = envinfo._asdict()
-
-    # If nvidia_gpu_models is multiline, start on the next line
-    mutable_dict["nvidia_gpu_models"] = maybe_start_on_next_line(envinfo.nvidia_gpu_models)
-
-    # If the machine doesn't have CUDA, report some fields as 'No CUDA'
-    dynamic_cuda_fields = [
-        "cuda_runtime_version",
-        "nvidia_gpu_models",
-        "nvidia_driver_version",
-    ]
-    all_cuda_fields = dynamic_cuda_fields + ["cudnn_version"]
-    all_dynamic_cuda_fields_missing = all(mutable_dict[field] is None for field in dynamic_cuda_fields)
-    if TORCH_AVAILABLE and not torch.cuda.is_available() and all_dynamic_cuda_fields_missing:
-        for field in all_cuda_fields:
-            mutable_dict[field] = "No CUDA"
-        if envinfo.cuda_compiled_version is None:
-            mutable_dict["cuda_compiled_version"] = "None"
-
-    # Replace True with Yes, False with No
-    mutable_dict = replace_bools(mutable_dict)
-
-    # Replace all None objects with 'Could not collect'
-    mutable_dict = replace_nones(mutable_dict)
-
-    # If either of these are '', replace with 'No relevant packages'
-    mutable_dict["pip_packages"] = replace_if_empty(mutable_dict["pip_packages"])
-    mutable_dict["conda_packages"] = replace_if_empty(mutable_dict["conda_packages"])
-
-    # Tag conda and pip packages with a prefix
-    # If they were previously None, they'll show up as ie '[conda] Could not collect'
-    if mutable_dict["pip_packages"]:
-        mutable_dict["pip_packages"] = prepend(mutable_dict["pip_packages"], "[{}] ".format(envinfo.pip_version))
-    if mutable_dict["conda_packages"]:
-        mutable_dict["conda_packages"] = prepend(mutable_dict["conda_packages"], "[conda] ")
-    mutable_dict["cpu_info"] = envinfo.cpu_info
-    return env_info_fmt.format(**mutable_dict)
-
-
-def get_pretty_env_info():
-    return pretty_str(get_env_info())
-
-
-def main():
-    print("Collecting environment information...")
-    output = get_pretty_env_info()
-    print(output)
-
-    if TORCH_AVAILABLE and hasattr(torch, "utils") and hasattr(torch.utils, "_crash_handler"):
-        minidump_dir = torch.utils._crash_handler.DEFAULT_MINIDUMP_DIR
-        if sys.platform == "linux" and os.path.exists(minidump_dir):
-            dumps = [os.path.join(minidump_dir, dump) for dump in os.listdir(minidump_dir)]
-            latest = max(dumps, key=os.path.getctime)
-            ctime = os.path.getctime(latest)
-            creation_time = datetime.datetime.fromtimestamp(ctime).strftime("%Y-%m-%d %H:%M:%S")
-            msg = (
-                "\n*** Detected a minidump at {} created on {}, ".format(latest, creation_time)
-                + "if this is related to your bug please include it when you file a report ***"
-            )
-            print(msg, file=sys.stderr)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py
index 5dfcdc542f..9b8413d0a2 100644
--- a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py
+++ b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py
@@ -88,6 +88,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         # Store vllm_config for potential future use
         self.vllm_config = vllm_config
 
+    @staticmethod
+    def extract_val(d, key, default):
+        val = d.get(key, default)
+        if isinstance(val, list):
+            return val[0] if len(val) > 0 else default
+        return val
+
     def forward(
         self,
         input_ids: torch.Tensor | None = None,
@@ -115,18 +122,11 @@ def forward(
         batched_keys = {"ref_audio", "ref_text", "x_vector_only_mode", "voice_clone_prompt"}
 
         for req_info in runtime_info_list:
-
-            def extract_val(d, key, default):
-                val = d.get(key, default)
-                if isinstance(val, list):
-                    return val[0] if len(val) > 0 else default
-                return val
-
-            texts.append(extract_val(req_info, "text", ""))
-            task_types.append(extract_val(req_info, "task_type", self.task_type))
-            speakers.append(extract_val(req_info, "speaker", "uncle_fu"))
-            languages.append(extract_val(req_info, "language", "Auto"))
-            instructs.append(extract_val(req_info, "instruct", ""))
+            texts.append(self.extract_val(req_info, "text", ""))
+            task_types.append(self.extract_val(req_info, "task_type", self.task_type))
+            speakers.append(self.extract_val(req_info, "speaker", "uncle_fu"))
+            languages.append(self.extract_val(req_info, "language", "Auto"))
+            instructs.append(self.extract_val(req_info, "instruct", ""))
 
             for k, v in req_info.items():
                 if k not in ["text", "task_type", "speaker", "language", "instruct"]:
diff --git a/vllm_omni/model_executor/stage_configs/qwen3_tts.yaml b/vllm_omni/model_executor/stage_configs/qwen3_tts.yaml
index 09cf2a4ccc..4fcd8c38ef 100644
--- a/vllm_omni/model_executor/stage_configs/qwen3_tts.yaml
+++ b/vllm_omni/model_executor/stage_configs/qwen3_tts.yaml
@@ -3,7 +3,7 @@ stage_args:
     stage_type: llm  # Use llm stage type to launch OmniLLM
     runtime:
       devices: "0"
-      max_batch_size: 100
+      max_batch_size: 10
     engine_args:
       model_stage: qwen3_tts
       model_arch: Qwen3TTSForConditionalGeneration
@@ -14,7 +14,7 @@ stage_args:
       async_scheduling: false
       enable_prefix_caching: false
       engine_output_type: audio  # Final output: audio waveform
-      gpu_memory_utilization: 0.8
+      gpu_memory_utilization: 0.5
       distributed_executor_backend: "mp"
       max_num_batched_tokens: 1000000
 

From b8daf11e6c185fedbcd69bee8fddd240d921f351 Mon Sep 17 00:00:00 2001
From: Roman Koshkin <roman.koshkin@sbintuitions.co.jp>
Date: Sat, 21 Feb 2026 16:22:36 +0900
Subject: [PATCH 3/4] [Bugfix] [cleanup2](qwen3_tts): enable batched offline
 inference by fixing tensor slicing

Signed-off-by: Roman Koshkin <roman.koshkin@sbintuitions.co.jp>
---
 vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py
index 9b8413d0a2..9a0a993a60 100644
--- a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py
+++ b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py
@@ -149,6 +149,8 @@ def forward(
             merged_kwargs["max_new_tokens"] = 2
 
         # Assume uniform task type across the batch
+        if len(set(task_types)) > 1:
+            raise ValueError(f"Mixed task types not supported: {set(task_types)}")
         task_type = task_types[0]
 
         # Call the appropriate generation method based on task_type, passing lists

From 8b80aeb753c65ae4f5e0659aecc4b81dcdc01d90 Mon Sep 17 00:00:00 2001
From: Roman Koshkin <roman.koshkin@sbintuitions.co.jp>
Date: Sat, 21 Feb 2026 16:29:44 +0900
Subject: [PATCH 4/4] [Bugfix] [cleanup3](qwen3_tts): enable batched offline
 inference by fixing tensor slicing

Signed-off-by: Roman Koshkin <roman.koshkin@sbintuitions.co.jp>
---
 .gitignore | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index 214c8efb42..12486f4a7f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -245,5 +245,3 @@ tmp_test
 
 # output files
 *.wav
-examples/offline_inference/qwen3_tts/test.py
-examples/online_serving/qwen3_tts/Untitled.ipynb