From 6dd5ad26bdd72904e6c5bd97928f64c1deb65c67 Mon Sep 17 00:00:00 2001 From: Roman Koshkin Date: Sat, 21 Feb 2026 09:06:34 +0900 Subject: [PATCH 1/4] [Bugfix] (qwen3_tts): enable batched offline inference by fixing tensor slicing Signed-off-by: Roman Koshkin --- .gitignore | 2 + READMEmy.md | 25 + .../qwen3_tts/collect_env.py | 760 ++++++++++++++++++ .../offline_inference/qwen3_tts/end2end.py | 14 +- .../models/qwen3_tts/qwen3_tts.py | 113 +-- .../stage_configs/qwen3_tts.yaml | 4 +- 6 files changed, 865 insertions(+), 53 deletions(-) create mode 100644 READMEmy.md create mode 100644 examples/offline_inference/qwen3_tts/collect_env.py diff --git a/.gitignore b/.gitignore index 12486f4a7f..214c8efb42 100644 --- a/.gitignore +++ b/.gitignore @@ -245,3 +245,5 @@ tmp_test # output files *.wav +examples/offline_inference/qwen3_tts/test.py +examples/online_serving/qwen3_tts/Untitled.ipynb diff --git a/READMEmy.md b/READMEmy.md new file mode 100644 index 0000000000..31dc180d61 --- /dev/null +++ b/READMEmy.md @@ -0,0 +1,25 @@ + + +```bash +cd /lustre/users/rkoshkin +git clone https://github.com/vllm-project/vllm-omni.git +cd vllm-omni +uv venv --python 3.10 --seed +source .venv/bin/activate +cd .. +git clone https://github.com/vllm-project/vllm.git +cd vllm +git checkout v0.16.0 +export VLLM_PRECOMPILED_WHEEL_LOCATION=https://github.com/vllm-project/vllm/releases/download/v0.16.0/vllm-0.16.0-cp38-abi3-manylinux_2_31_x86_64.whl +uv pip install -e . +cd ../vllm-omni +uv pip install -e . +``` + + + +```bash +# edit /lustre/users/rkoshkin/vllm-omni/vllm_omni/model_executor/stage_configs/qwen3_tts.yaml AS NECESSARY +cd examples/online_serving/qwen3_tts +./run_server.sh Base +``` diff --git a/examples/offline_inference/qwen3_tts/collect_env.py b/examples/offline_inference/qwen3_tts/collect_env.py new file mode 100644 index 0000000000..8b09379e1a --- /dev/null +++ b/examples/offline_inference/qwen3_tts/collect_env.py @@ -0,0 +1,760 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# ruff: noqa +# code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py + +import datetime +import locale +import os +import subprocess +import sys + +# Unlike the rest of the PyTorch this file must be python2 compliant. +# This script outputs relevant system environment info +# Run it with `python collect_env.py` or `python -m torch.utils.collect_env` +from collections import namedtuple + +import regex as re + +from vllm.envs import environment_variables + +try: + import torch + + TORCH_AVAILABLE = True +except (ImportError, NameError, AttributeError, OSError): + TORCH_AVAILABLE = False + +# System Environment Information +SystemEnv = namedtuple( + "SystemEnv", + [ + "torch_version", + "is_debug_build", + "cuda_compiled_version", + "gcc_version", + "clang_version", + "cmake_version", + "os", + "libc_version", + "python_version", + "python_platform", + "is_cuda_available", + "cuda_runtime_version", + "cuda_module_loading", + "nvidia_driver_version", + "nvidia_gpu_models", + "cudnn_version", + "pip_version", # 'pip' or 'pip3' + "pip_packages", + "conda_packages", + "hip_compiled_version", + "hip_runtime_version", + "miopen_runtime_version", + "caching_allocator_config", + "is_xnnpack_available", + "cpu_info", + "rocm_version", # vllm specific field + "vllm_version", # vllm specific field + "vllm_omni_version", # vllm-omni specific field + "vllm_build_flags", # vllm specific field + "gpu_topo", # vllm specific field + "env_vars", + ], +) + +DEFAULT_CONDA_PATTERNS = { + "torch", + "numpy", + "cudatoolkit", + "soumith", + "mkl", + "magma", + "triton", + "optree", + "nccl", + "transformers", + "zmq", + "nvidia", + "pynvml", + "flashinfer-python", +} + +DEFAULT_PIP_PATTERNS = { + "torch", + "numpy", + "mypy", + "flake8", + "triton", + "optree", + "onnx", + "nccl", + "transformers", + "zmq", + "nvidia", + "pynvml", + "flashinfer-python", +} + + +def run(command): + """Return (return-code, stdout, stderr).""" + shell = True if type(command) is str else False + try: + p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=shell) + raw_output, raw_err = p.communicate() + rc = p.returncode + if get_platform() == "win32": + enc = "oem" + else: + enc = locale.getpreferredencoding() + output = raw_output.decode(enc) + if command == "nvidia-smi topo -m": + # don't remove the leading whitespace of `nvidia-smi topo -m` + # because they are meaningful + output = output.rstrip() + else: + output = output.strip() + err = raw_err.decode(enc) + return rc, output, err.strip() + + except FileNotFoundError: + cmd_str = command if isinstance(command, str) else command[0] + return 127, "", f"Command not found: {cmd_str}" + + +def run_and_read_all(run_lambda, command): + """Run command using run_lambda; reads and returns entire output if rc is 0.""" + rc, out, _ = run_lambda(command) + if rc != 0: + return None + return out + + +def run_and_parse_first_match(run_lambda, command, regex): + """Run command using run_lambda, returns the first regex match if it exists.""" + rc, out, _ = run_lambda(command) + if rc != 0: + return None + match = re.search(regex, out) + if match is None: + return None + return match.group(1) + + +def run_and_return_first_line(run_lambda, command): + """Run command using run_lambda and returns first line if output is not empty.""" + rc, out, _ = run_lambda(command) + if rc != 0: + return None + return out.split("\n")[0] + + +def get_conda_packages(run_lambda, patterns=None): + if patterns is None: + patterns = DEFAULT_CONDA_PATTERNS + conda = os.environ.get("CONDA_EXE", "conda") + out = run_and_read_all(run_lambda, [conda, "list"]) + if out is None: + return out + + return "\n".join( + line for line in out.splitlines() if not line.startswith("#") and any(name in line for name in patterns) + ) + + +def get_gcc_version(run_lambda): + return run_and_parse_first_match(run_lambda, "gcc --version", r"gcc (.*)") + + +def get_clang_version(run_lambda): + return run_and_parse_first_match(run_lambda, "clang --version", r"clang version (.*)") + + +def get_cmake_version(run_lambda): + return run_and_parse_first_match(run_lambda, "cmake --version", r"cmake (.*)") + + +def get_nvidia_driver_version(run_lambda): + if get_platform() == "darwin": + cmd = "kextstat | grep -i cuda" + return run_and_parse_first_match(run_lambda, cmd, r"com[.]nvidia[.]CUDA [(](.*?)[)]") + smi = get_nvidia_smi() + return run_and_parse_first_match(run_lambda, smi, r"Driver Version: (.*?) ") + + +def get_gpu_info(run_lambda): + if get_platform() == "darwin" or ( + TORCH_AVAILABLE and hasattr(torch.version, "hip") and torch.version.hip is not None + ): + if TORCH_AVAILABLE and torch.cuda.is_available(): + if torch.version.hip is not None: + prop = torch.cuda.get_device_properties(0) + if hasattr(prop, "gcnArchName"): + gcnArch = " ({})".format(prop.gcnArchName) + else: + gcnArch = "NoGCNArchNameOnOldPyTorch" + else: + gcnArch = "" + return torch.cuda.get_device_name(None) + gcnArch + return None + smi = get_nvidia_smi() + uuid_regex = re.compile(r" \(UUID: .+?\)") + rc, out, _ = run_lambda(smi + " -L") + if rc != 0: + return None + # Anonymize GPUs by removing their UUID + return re.sub(uuid_regex, "", out) + + +def get_running_cuda_version(run_lambda): + return run_and_parse_first_match(run_lambda, "nvcc --version", r"release .+ V(.*)") + + +def get_cudnn_version(run_lambda): + """Return a list of libcudnn.so; it's hard to tell which one is being used.""" + if get_platform() == "win32": + system_root = os.environ.get("SYSTEMROOT", "C:\\Windows") + cuda_path = os.environ.get("CUDA_PATH", "%CUDA_PATH%") + where_cmd = os.path.join(system_root, "System32", "where") + cudnn_cmd = '{} /R "{}\\bin" cudnn*.dll'.format(where_cmd, cuda_path) + elif get_platform() == "darwin": + # CUDA libraries and drivers can be found in /usr/local/cuda/. See + # https://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#install + # https://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html#installmac + # Use CUDNN_LIBRARY when cudnn library is installed elsewhere. + cudnn_cmd = "ls /usr/local/cuda/lib/libcudnn*" + else: + cudnn_cmd = 'ldconfig -p | grep libcudnn | rev | cut -d" " -f1 | rev' + rc, out, _ = run_lambda(cudnn_cmd) + # find will return 1 if there are permission errors or if not found + if len(out) == 0 or (rc != 1 and rc != 0): + l = os.environ.get("CUDNN_LIBRARY") + if l is not None and os.path.isfile(l): + return os.path.realpath(l) + return None + files_set = set() + for fn in out.split("\n"): + fn = os.path.realpath(fn) # eliminate symbolic links + if os.path.isfile(fn): + files_set.add(fn) + if not files_set: + return None + # Alphabetize the result because the order is non-deterministic otherwise + files = sorted(files_set) + if len(files) == 1: + return files[0] + result = "\n".join(files) + return "Probably one of the following:\n{}".format(result) + + +def get_nvidia_smi(): + # Note: nvidia-smi is currently available only on Windows and Linux + smi = "nvidia-smi" + if get_platform() == "win32": + system_root = os.environ.get("SYSTEMROOT", "C:\\Windows") + program_files_root = os.environ.get("PROGRAMFILES", "C:\\Program Files") + legacy_path = os.path.join(program_files_root, "NVIDIA Corporation", "NVSMI", smi) + new_path = os.path.join(system_root, "System32", smi) + smis = [new_path, legacy_path] + for candidate_smi in smis: + if os.path.exists(candidate_smi): + smi = '"{}"'.format(candidate_smi) + break + return smi + + +def get_rocm_version(run_lambda): + """Returns the ROCm version if available, otherwise 'N/A'.""" + return run_and_parse_first_match(run_lambda, "hipcc --version", r"HIP version: (\S+)") + + +def get_vllm_version(): + from vllm import __version__, __version_tuple__ + + if __version__ == "dev": + return "N/A (dev)" + version_str = __version_tuple__[-1] + if isinstance(version_str, str) and version_str.startswith("g"): + # it's a dev build + if "." in version_str: + # it's a dev build containing local changes + git_sha = version_str.split(".")[0][1:] + date = version_str.split(".")[-1][1:] + return f"{__version__} (git sha: {git_sha}, date: {date})" + else: + # it's a dev build without local changes + git_sha = version_str[1:] # type: ignore + return f"{__version__} (git sha: {git_sha})" + return __version__ + + +def get_vllm_omni_version(run_lambda): + try: + import vllm_omni + from vllm_omni import __version__, __version_tuple__ + + version_str = __version_tuple__[-1] + if isinstance(version_str, str) and version_str.startswith("g"): + if "." in version_str: + git_sha = version_str.split(".")[0][1:] + date = version_str.split(".")[-1][1:] + return f"{__version__} (git sha: {git_sha}, date: {date})" + else: + git_sha = version_str[1:] + return f"{__version__} (git sha: {git_sha})" + + package_dir = os.path.dirname(os.path.abspath(vllm_omni.__file__)) + git_sha = run_and_read_all(run_lambda, f"git -C {package_dir} rev-parse --short HEAD") + if git_sha: + return f"{__version__} (git sha: {git_sha})" + + return __version__ + except ImportError: + return "N/A (vllm_omni not installed)" + + +def summarize_vllm_build_flags(): + # This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc. + return "CUDA Archs: {}; ROCm: {}".format( + os.environ.get("TORCH_CUDA_ARCH_LIST", "Not Set"), + "Enabled" if os.environ.get("ROCM_HOME") else "Disabled", + ) + + +def get_gpu_topo(run_lambda): + output = None + + if get_platform() == "linux": + output = run_and_read_all(run_lambda, "nvidia-smi topo -m") + if output is None: + output = run_and_read_all(run_lambda, "rocm-smi --showtopo") + + return output + + +def get_cpu_info(run_lambda): + rc, out, err = 0, "", "" + if get_platform() == "linux": + rc, out, err = run_lambda("lscpu") + elif get_platform() == "win32": + rc, out, err = run_lambda( + "wmic cpu get Name,Manufacturer,Family,Architecture,ProcessorType,DeviceID, \ + CurrentClockSpeed,MaxClockSpeed,L2CacheSize,L2CacheSpeed,Revision /VALUE" + ) + elif get_platform() == "darwin": + rc, out, err = run_lambda("sysctl -n machdep.cpu.brand_string") + cpu_info = "None" + if rc == 0: + cpu_info = out + else: + cpu_info = err + return cpu_info + + +def get_platform(): + if sys.platform.startswith("linux"): + return "linux" + elif sys.platform.startswith("win32"): + return "win32" + elif sys.platform.startswith("cygwin"): + return "cygwin" + elif sys.platform.startswith("darwin"): + return "darwin" + else: + return sys.platform + + +def get_mac_version(run_lambda): + return run_and_parse_first_match(run_lambda, "sw_vers -productVersion", r"(.*)") + + +def get_windows_version(run_lambda): + system_root = os.environ.get("SYSTEMROOT", "C:\\Windows") + wmic_cmd = os.path.join(system_root, "System32", "Wbem", "wmic") + findstr_cmd = os.path.join(system_root, "System32", "findstr") + return run_and_read_all(run_lambda, "{} os get Caption | {} /v Caption".format(wmic_cmd, findstr_cmd)) + + +def get_lsb_version(run_lambda): + return run_and_parse_first_match(run_lambda, "lsb_release -a", r"Description:\t(.*)") + + +def check_release_file(run_lambda): + return run_and_parse_first_match(run_lambda, "cat /etc/*-release", r'PRETTY_NAME="(.*)"') + + +def get_os(run_lambda): + from platform import machine + + platform = get_platform() + + if platform == "win32" or platform == "cygwin": + return get_windows_version(run_lambda) + + if platform == "darwin": + version = get_mac_version(run_lambda) + if version is None: + return None + return "macOS {} ({})".format(version, machine()) + + if platform == "linux": + # Ubuntu/Debian based + desc = get_lsb_version(run_lambda) + if desc is not None: + return "{} ({})".format(desc, machine()) + + # Try reading /etc/*-release + desc = check_release_file(run_lambda) + if desc is not None: + return "{} ({})".format(desc, machine()) + + return "{} ({})".format(platform, machine()) + + # Unknown platform + return platform + + +def get_python_platform(): + import platform + + return platform.platform() + + +def get_libc_version(): + import platform + + if get_platform() != "linux": + return "N/A" + return "-".join(platform.libc_ver()) + + +def is_uv_venv(): + if os.environ.get("UV"): + return True + pyvenv_cfg_path = os.path.join(sys.prefix, "pyvenv.cfg") + if os.path.exists(pyvenv_cfg_path): + with open(pyvenv_cfg_path, "r") as f: + return any(line.startswith("uv = ") for line in f) + return False + + +def get_pip_packages(run_lambda, patterns=None): + """Return `pip list` output. Note: will also find conda-installed pytorch and numpy packages.""" + if patterns is None: + patterns = DEFAULT_PIP_PATTERNS + + def run_with_pip(): + try: + import importlib.util + + pip_spec = importlib.util.find_spec("pip") + pip_available = pip_spec is not None + except ImportError: + pip_available = False + + if pip_available: + cmd = [sys.executable, "-mpip", "list", "--format=freeze"] + elif is_uv_venv(): + print("uv is set") + cmd = ["uv", "pip", "list", "--format=freeze"] + else: + raise RuntimeError("Could not collect pip list output (pip or uv module not available)") + + out = run_and_read_all(run_lambda, cmd) + return "\n".join(line for line in out.splitlines() if any(name in line for name in patterns)) + + pip_version = "pip3" if sys.version[0] == "3" else "pip" + out = run_with_pip() + return pip_version, out + + +def get_cachingallocator_config(): + ca_config = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "") + return ca_config + + +def get_cuda_module_loading_config(): + if TORCH_AVAILABLE and torch.cuda.is_available(): + torch.cuda.init() + config = os.environ.get("CUDA_MODULE_LOADING", "") + return config + else: + return "N/A" + + +def is_xnnpack_available(): + if TORCH_AVAILABLE: + import torch.backends.xnnpack + + return str(torch.backends.xnnpack.enabled) # type: ignore[attr-defined] + else: + return "N/A" + + +def get_env_vars(): + env_vars = "" + secret_terms = ("secret", "token", "api", "access", "password") + report_prefix = ( + "TORCH", + "NCCL", + "PYTORCH", + "CUDA", + "CUBLAS", + "CUDNN", + "OMP_", + "MKL_", + "NVIDIA", + ) + for k, v in os.environ.items(): + if any(term in k.lower() for term in secret_terms): + continue + if k in environment_variables: + env_vars = env_vars + "{}={}".format(k, v) + "\n" + if k.startswith(report_prefix): + env_vars = env_vars + "{}={}".format(k, v) + "\n" + + return env_vars + + +def get_env_info(): + run_lambda = run + pip_version, pip_list_output = get_pip_packages(run_lambda) + + if TORCH_AVAILABLE: + version_str = torch.__version__ + debug_mode_str = str(torch.version.debug) + cuda_available_str = str(torch.cuda.is_available()) + cuda_version_str = torch.version.cuda + if not hasattr(torch.version, "hip") or torch.version.hip is None: # cuda version + hip_compiled_version = hip_runtime_version = miopen_runtime_version = "N/A" + else: # HIP version + + def get_version_or_na(cfg, prefix): + _lst = [s.rsplit(None, 1)[-1] for s in cfg if prefix in s] + return _lst[0] if _lst else "N/A" + + cfg = torch._C._show_config().split("\n") + hip_runtime_version = get_version_or_na(cfg, "HIP Runtime") + miopen_runtime_version = get_version_or_na(cfg, "MIOpen") + cuda_version_str = "N/A" + hip_compiled_version = torch.version.hip + else: + version_str = debug_mode_str = cuda_available_str = cuda_version_str = "N/A" + hip_compiled_version = hip_runtime_version = miopen_runtime_version = "N/A" + + sys_version = sys.version.replace("\n", " ") + + conda_packages = get_conda_packages(run_lambda) + + rocm_version = get_rocm_version(run_lambda) + vllm_version = get_vllm_version() + vllm_omni_version = get_vllm_omni_version(run_lambda) + vllm_build_flags = summarize_vllm_build_flags() + gpu_topo = get_gpu_topo(run_lambda) + + return SystemEnv( + torch_version=version_str, + is_debug_build=debug_mode_str, + python_version="{} ({}-bit runtime)".format(sys_version, sys.maxsize.bit_length() + 1), + python_platform=get_python_platform(), + is_cuda_available=cuda_available_str, + cuda_compiled_version=cuda_version_str, + cuda_runtime_version=get_running_cuda_version(run_lambda), + cuda_module_loading=get_cuda_module_loading_config(), + nvidia_gpu_models=get_gpu_info(run_lambda), + nvidia_driver_version=get_nvidia_driver_version(run_lambda), + cudnn_version=get_cudnn_version(run_lambda), + hip_compiled_version=hip_compiled_version, + hip_runtime_version=hip_runtime_version, + miopen_runtime_version=miopen_runtime_version, + pip_version=pip_version, + pip_packages=pip_list_output, + conda_packages=conda_packages, + os=get_os(run_lambda), + libc_version=get_libc_version(), + gcc_version=get_gcc_version(run_lambda), + clang_version=get_clang_version(run_lambda), + cmake_version=get_cmake_version(run_lambda), + caching_allocator_config=get_cachingallocator_config(), + is_xnnpack_available=is_xnnpack_available(), + cpu_info=get_cpu_info(run_lambda), + rocm_version=rocm_version, + vllm_version=vllm_version, + vllm_omni_version=vllm_omni_version, + vllm_build_flags=vllm_build_flags, + gpu_topo=gpu_topo, + env_vars=get_env_vars(), + ) + + +env_info_fmt = """ +============================== + System Info +============================== +OS : {os} +GCC version : {gcc_version} +Clang version : {clang_version} +CMake version : {cmake_version} +Libc version : {libc_version} + +============================== + PyTorch Info +============================== +PyTorch version : {torch_version} +Is debug build : {is_debug_build} +CUDA used to build PyTorch : {cuda_compiled_version} +ROCM used to build PyTorch : {hip_compiled_version} + +============================== + Python Environment +============================== +Python version : {python_version} +Python platform : {python_platform} + +============================== + CUDA / GPU Info +============================== +Is CUDA available : {is_cuda_available} +CUDA runtime version : {cuda_runtime_version} +CUDA_MODULE_LOADING set to : {cuda_module_loading} +GPU models and configuration : {nvidia_gpu_models} +Nvidia driver version : {nvidia_driver_version} +cuDNN version : {cudnn_version} +HIP runtime version : {hip_runtime_version} +MIOpen runtime version : {miopen_runtime_version} +Is XNNPACK available : {is_xnnpack_available} + +============================== + CPU Info +============================== +{cpu_info} + +============================== +Versions of relevant libraries +============================== +{pip_packages} +{conda_packages} +""".strip() + +# both the above code and the following code use `strip()` to +# remove leading/trailing whitespaces, so we need to add a newline +# in between to separate the two sections +env_info_fmt += "\n\n" + +env_info_fmt += """ +============================== + vLLM Info +============================== +ROCM Version : {rocm_version} +vLLM Version : {vllm_version} +vLLM-Omni Version : {vllm_omni_version} +vLLM Build Flags: + {vllm_build_flags} +GPU Topology: + {gpu_topo} + +============================== + Environment Variables +============================== +{env_vars} +""".strip() + + +def pretty_str(envinfo): + def replace_nones(dct, replacement="Could not collect"): + for key in dct.keys(): + if dct[key] is not None: + continue + dct[key] = replacement + return dct + + def replace_bools(dct, true="Yes", false="No"): + for key in dct.keys(): + if dct[key] is True: + dct[key] = true + elif dct[key] is False: + dct[key] = false + return dct + + def prepend(text, tag="[prepend]"): + lines = text.split("\n") + updated_lines = [tag + line for line in lines] + return "\n".join(updated_lines) + + def replace_if_empty(text, replacement="No relevant packages"): + if text is not None and len(text) == 0: + return replacement + return text + + def maybe_start_on_next_line(string): + # If `string` is multiline, prepend a \n to it. + if string is not None and len(string.split("\n")) > 1: + return "\n{}\n".format(string) + return string + + mutable_dict = envinfo._asdict() + + # If nvidia_gpu_models is multiline, start on the next line + mutable_dict["nvidia_gpu_models"] = maybe_start_on_next_line(envinfo.nvidia_gpu_models) + + # If the machine doesn't have CUDA, report some fields as 'No CUDA' + dynamic_cuda_fields = [ + "cuda_runtime_version", + "nvidia_gpu_models", + "nvidia_driver_version", + ] + all_cuda_fields = dynamic_cuda_fields + ["cudnn_version"] + all_dynamic_cuda_fields_missing = all(mutable_dict[field] is None for field in dynamic_cuda_fields) + if TORCH_AVAILABLE and not torch.cuda.is_available() and all_dynamic_cuda_fields_missing: + for field in all_cuda_fields: + mutable_dict[field] = "No CUDA" + if envinfo.cuda_compiled_version is None: + mutable_dict["cuda_compiled_version"] = "None" + + # Replace True with Yes, False with No + mutable_dict = replace_bools(mutable_dict) + + # Replace all None objects with 'Could not collect' + mutable_dict = replace_nones(mutable_dict) + + # If either of these are '', replace with 'No relevant packages' + mutable_dict["pip_packages"] = replace_if_empty(mutable_dict["pip_packages"]) + mutable_dict["conda_packages"] = replace_if_empty(mutable_dict["conda_packages"]) + + # Tag conda and pip packages with a prefix + # If they were previously None, they'll show up as ie '[conda] Could not collect' + if mutable_dict["pip_packages"]: + mutable_dict["pip_packages"] = prepend(mutable_dict["pip_packages"], "[{}] ".format(envinfo.pip_version)) + if mutable_dict["conda_packages"]: + mutable_dict["conda_packages"] = prepend(mutable_dict["conda_packages"], "[conda] ") + mutable_dict["cpu_info"] = envinfo.cpu_info + return env_info_fmt.format(**mutable_dict) + + +def get_pretty_env_info(): + return pretty_str(get_env_info()) + + +def main(): + print("Collecting environment information...") + output = get_pretty_env_info() + print(output) + + if TORCH_AVAILABLE and hasattr(torch, "utils") and hasattr(torch.utils, "_crash_handler"): + minidump_dir = torch.utils._crash_handler.DEFAULT_MINIDUMP_DIR + if sys.platform == "linux" and os.path.exists(minidump_dir): + dumps = [os.path.join(minidump_dir, dump) for dump in os.listdir(minidump_dir)] + latest = max(dumps, key=os.path.getctime) + ctime = os.path.getctime(latest) + creation_time = datetime.datetime.fromtimestamp(ctime).strftime("%Y-%m-%d %H:%M:%S") + msg = ( + "\n*** Detected a minidump at {} created on {}, ".format(latest, creation_time) + + "if this is related to your bug please include it when you file a report ***" + ) + print(msg, file=sys.stderr) + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/qwen3_tts/end2end.py b/examples/offline_inference/qwen3_tts/end2end.py index 93aeba3ca5..ddb57869f9 100644 --- a/examples/offline_inference/qwen3_tts/end2end.py +++ b/examples/offline_inference/qwen3_tts/end2end.py @@ -35,10 +35,16 @@ def get_custom_voice_query(use_batch_sample: bool = False) -> QueryResult: """ task_type = "CustomVoice" if use_batch_sample: - texts = ["其实我真的有发现,我是一个特别善于观察别人情绪的人。", "She said she would be here by noon."] - instructs = ["", "Very happy."] - languages = ["Chinese", "English"] - speakers = ["Vivian", "Ryan"] + texts = [ + "其实我真的有发现,我是一个特别善于观察别人情绪的人。", + "She said she would be here by noon.", + "I like you very much.", + "Really, you do?", + "Yes, absolutely.", + ] + instructs = ["", "Very happy.", "Very happy.", "Very happy.", "Very happy."] + languages = ["Chinese", "English", "English", "English", "English"] + speakers = ["Vivian", "Ryan", "Ryan", "Ryan", "Ryan"] inputs = [] for text, instruct, language, speaker in zip(texts, instructs, languages, speakers): prompt = f"<|im_start|>assistant\n{text}<|im_end|>\n<|im_start|>assistant\n" diff --git a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py index 664d2a2957..5dfcdc542f 100644 --- a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py +++ b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py @@ -97,53 +97,69 @@ def forward( **kwargs: Any, ) -> OmniOutput: """ - Forward pass for TTS generation model. - - Args: - input_ids: Input token IDs (required for TTS generation) - positions: Position IDs (not used for TTS, but required by runner) - intermediate_tensors: Intermediate tensors for pipeline parallelism (not used) - inputs_embeds: Input embeddings (not used for TTS, but required by runner) - **kwargs: Additional arguments including task_type, sampling_metadata, etc. - - Returns: - OmniOutput: Contains multimodal outputs with audio tensors + Forward pass for TTS generation model (Patched for batched inference). """ - - # Extract additional parameters from kwargs that the generation methods expect - - runtime_additional_information = kwargs.get("runtime_additional_information", [{}]) - if isinstance(runtime_additional_information, list) and len(runtime_additional_information) > 0: - runtime_additional_information = runtime_additional_information[0] - text = runtime_additional_information.pop("text", [""])[0] - # Extract task_type from kwargs, default to "instruct" - task_type = runtime_additional_information.pop("task_type", [self.task_type])[0] - speaker = runtime_additional_information.pop("speaker", ["uncle_fu"])[0] - language = runtime_additional_information.pop("language", ["Auto"])[0] - instruct = runtime_additional_information.pop("instruct", [""])[0] - for key, value in runtime_additional_information.items(): - if isinstance(value, list) and len(value) > 0: - runtime_additional_information[key] = value[0] - - # During profile/warmup runs, text is empty and no real inputs exist. - # Cap generation steps so the full pipeline executes (preserving - # KV-cache profiling behaviour) but exits quickly even if the model - # cannot converge from degenerate dummy inputs. - if not text: + runtime_info_list = kwargs.get("runtime_additional_information", [{}]) + if not isinstance(runtime_info_list, list): + runtime_info_list = [runtime_info_list] + + # Initialize lists to accumulate batched inputs + texts = [] + task_types = [] + speakers = [] + languages = [] + instructs = [] + merged_kwargs = {} + + # Keys that the underlying model natively supports as lists for batched inference + batched_keys = {"ref_audio", "ref_text", "x_vector_only_mode", "voice_clone_prompt"} + + for req_info in runtime_info_list: + + def extract_val(d, key, default): + val = d.get(key, default) + if isinstance(val, list): + return val[0] if len(val) > 0 else default + return val + + texts.append(extract_val(req_info, "text", "")) + task_types.append(extract_val(req_info, "task_type", self.task_type)) + speakers.append(extract_val(req_info, "speaker", "uncle_fu")) + languages.append(extract_val(req_info, "language", "Auto")) + instructs.append(extract_val(req_info, "instruct", "")) + + for k, v in req_info.items(): + if k not in ["text", "task_type", "speaker", "language", "instruct"]: + # Extract single value from list if wrapped + val = v[0] if isinstance(v, list) and len(v) > 0 else v + + if k in batched_keys: + # Accumulate as list for batched generation + if k not in merged_kwargs: + merged_kwargs[k] = [] + merged_kwargs[k].append(val) + else: + # For scalar params (e.g. max_new_tokens), take from the first request + if k not in merged_kwargs: + merged_kwargs[k] = val + + # During profile/warmup runs, texts are empty. + if all(not t for t in texts): logger.info("Profile run detected (empty text). Capping max_new_tokens to 2.") - runtime_additional_information["max_new_tokens"] = 2 + merged_kwargs["max_new_tokens"] = 2 + + # Assume uniform task type across the batch + task_type = task_types[0] - # Call the appropriate generation method based on task_type + # Call the appropriate generation method based on task_type, passing lists if task_type == "CustomVoice": result = self.model.generate_custom_voice( - text, speaker=speaker, language=language, instruct=instruct, **runtime_additional_information + texts, speaker=speakers, language=languages, instruct=instructs, **merged_kwargs ) elif task_type == "VoiceDesign": - result = self.model.generate_voice_design( - text, instruct=instruct, language=language, **runtime_additional_information - ) + result = self.model.generate_voice_design(texts, instruct=instructs, language=languages, **merged_kwargs) elif task_type == "Base": - result = self.model.generate_voice_clone(text, language=language, **runtime_additional_information) + result = self.model.generate_voice_clone(texts, language=languages, **merged_kwargs) else: raise ValueError(f"Invalid task type: {task_type}") @@ -162,17 +178,20 @@ def make_omni_output(self, model_outputs: torch.Tensor | OmniOutput | tuple, **k # Handle tuple format: (audio_tensors, sample_rate) if isinstance(model_outputs, tuple) and len(model_outputs) == 2: audio_tensors, sr = model_outputs - # audio_tensors is a list of numpy arrays, convert first one to tensor if needed + # audio_tensors is a list of numpy arrays, convert ALL to tensors if isinstance(audio_tensors, list) and len(audio_tensors) > 0: - # Convert numpy array to tensor if needed - audio_tensor = audio_tensors[0] - if isinstance(audio_tensor, np.ndarray): - audio_tensor = torch.from_numpy(audio_tensor).float() - elif not isinstance(audio_tensor, torch.Tensor): - audio_tensor = torch.tensor(audio_tensor, dtype=torch.float32) + audio_tensor_list = [] + for audio_tensor in audio_tensors: + if isinstance(audio_tensor, np.ndarray): + audio_tensor_list.append(torch.from_numpy(audio_tensor).float()) + elif not isinstance(audio_tensor, torch.Tensor): + audio_tensor_list.append(torch.tensor(audio_tensor, dtype=torch.float32)) + else: + audio_tensor_list.append(audio_tensor) + return OmniOutput( text_hidden_states=None, - multimodal_outputs={"model_outputs": audio_tensor, "sr": torch.tensor(sr, dtype=torch.int)}, + multimodal_outputs={"model_outputs": audio_tensor_list, "sr": torch.tensor(sr, dtype=torch.int)}, ) # If it's already a tensor, wrap it diff --git a/vllm_omni/model_executor/stage_configs/qwen3_tts.yaml b/vllm_omni/model_executor/stage_configs/qwen3_tts.yaml index d408dbab91..09cf2a4ccc 100644 --- a/vllm_omni/model_executor/stage_configs/qwen3_tts.yaml +++ b/vllm_omni/model_executor/stage_configs/qwen3_tts.yaml @@ -3,7 +3,7 @@ stage_args: stage_type: llm # Use llm stage type to launch OmniLLM runtime: devices: "0" - max_batch_size: 1 + max_batch_size: 100 engine_args: model_stage: qwen3_tts model_arch: Qwen3TTSForConditionalGeneration @@ -14,7 +14,7 @@ stage_args: async_scheduling: false enable_prefix_caching: false engine_output_type: audio # Final output: audio waveform - gpu_memory_utilization: 0.1 + gpu_memory_utilization: 0.8 distributed_executor_backend: "mp" max_num_batched_tokens: 1000000 From 732453e1a50a86fdbea04fcec50c601b1ea84826 Mon Sep 17 00:00:00 2001 From: Roman Koshkin Date: Sat, 21 Feb 2026 15:38:31 +0900 Subject: [PATCH 2/4] [Bugfix] [cleanup](qwen3_tts): enable batched offline inference by fixing tensor slicing Signed-off-by: Roman Koshkin --- READMEmy.md | 25 - .../qwen3_tts/collect_env.py | 760 ------------------ .../models/qwen3_tts/qwen3_tts.py | 24 +- .../stage_configs/qwen3_tts.yaml | 4 +- 4 files changed, 14 insertions(+), 799 deletions(-) delete mode 100644 READMEmy.md delete mode 100644 examples/offline_inference/qwen3_tts/collect_env.py diff --git a/READMEmy.md b/READMEmy.md deleted file mode 100644 index 31dc180d61..0000000000 --- a/READMEmy.md +++ /dev/null @@ -1,25 +0,0 @@ - - -```bash -cd /lustre/users/rkoshkin -git clone https://github.com/vllm-project/vllm-omni.git -cd vllm-omni -uv venv --python 3.10 --seed -source .venv/bin/activate -cd .. -git clone https://github.com/vllm-project/vllm.git -cd vllm -git checkout v0.16.0 -export VLLM_PRECOMPILED_WHEEL_LOCATION=https://github.com/vllm-project/vllm/releases/download/v0.16.0/vllm-0.16.0-cp38-abi3-manylinux_2_31_x86_64.whl -uv pip install -e . -cd ../vllm-omni -uv pip install -e . -``` - - - -```bash -# edit /lustre/users/rkoshkin/vllm-omni/vllm_omni/model_executor/stage_configs/qwen3_tts.yaml AS NECESSARY -cd examples/online_serving/qwen3_tts -./run_server.sh Base -``` diff --git a/examples/offline_inference/qwen3_tts/collect_env.py b/examples/offline_inference/qwen3_tts/collect_env.py deleted file mode 100644 index 8b09379e1a..0000000000 --- a/examples/offline_inference/qwen3_tts/collect_env.py +++ /dev/null @@ -1,760 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# ruff: noqa -# code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py - -import datetime -import locale -import os -import subprocess -import sys - -# Unlike the rest of the PyTorch this file must be python2 compliant. -# This script outputs relevant system environment info -# Run it with `python collect_env.py` or `python -m torch.utils.collect_env` -from collections import namedtuple - -import regex as re - -from vllm.envs import environment_variables - -try: - import torch - - TORCH_AVAILABLE = True -except (ImportError, NameError, AttributeError, OSError): - TORCH_AVAILABLE = False - -# System Environment Information -SystemEnv = namedtuple( - "SystemEnv", - [ - "torch_version", - "is_debug_build", - "cuda_compiled_version", - "gcc_version", - "clang_version", - "cmake_version", - "os", - "libc_version", - "python_version", - "python_platform", - "is_cuda_available", - "cuda_runtime_version", - "cuda_module_loading", - "nvidia_driver_version", - "nvidia_gpu_models", - "cudnn_version", - "pip_version", # 'pip' or 'pip3' - "pip_packages", - "conda_packages", - "hip_compiled_version", - "hip_runtime_version", - "miopen_runtime_version", - "caching_allocator_config", - "is_xnnpack_available", - "cpu_info", - "rocm_version", # vllm specific field - "vllm_version", # vllm specific field - "vllm_omni_version", # vllm-omni specific field - "vllm_build_flags", # vllm specific field - "gpu_topo", # vllm specific field - "env_vars", - ], -) - -DEFAULT_CONDA_PATTERNS = { - "torch", - "numpy", - "cudatoolkit", - "soumith", - "mkl", - "magma", - "triton", - "optree", - "nccl", - "transformers", - "zmq", - "nvidia", - "pynvml", - "flashinfer-python", -} - -DEFAULT_PIP_PATTERNS = { - "torch", - "numpy", - "mypy", - "flake8", - "triton", - "optree", - "onnx", - "nccl", - "transformers", - "zmq", - "nvidia", - "pynvml", - "flashinfer-python", -} - - -def run(command): - """Return (return-code, stdout, stderr).""" - shell = True if type(command) is str else False - try: - p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=shell) - raw_output, raw_err = p.communicate() - rc = p.returncode - if get_platform() == "win32": - enc = "oem" - else: - enc = locale.getpreferredencoding() - output = raw_output.decode(enc) - if command == "nvidia-smi topo -m": - # don't remove the leading whitespace of `nvidia-smi topo -m` - # because they are meaningful - output = output.rstrip() - else: - output = output.strip() - err = raw_err.decode(enc) - return rc, output, err.strip() - - except FileNotFoundError: - cmd_str = command if isinstance(command, str) else command[0] - return 127, "", f"Command not found: {cmd_str}" - - -def run_and_read_all(run_lambda, command): - """Run command using run_lambda; reads and returns entire output if rc is 0.""" - rc, out, _ = run_lambda(command) - if rc != 0: - return None - return out - - -def run_and_parse_first_match(run_lambda, command, regex): - """Run command using run_lambda, returns the first regex match if it exists.""" - rc, out, _ = run_lambda(command) - if rc != 0: - return None - match = re.search(regex, out) - if match is None: - return None - return match.group(1) - - -def run_and_return_first_line(run_lambda, command): - """Run command using run_lambda and returns first line if output is not empty.""" - rc, out, _ = run_lambda(command) - if rc != 0: - return None - return out.split("\n")[0] - - -def get_conda_packages(run_lambda, patterns=None): - if patterns is None: - patterns = DEFAULT_CONDA_PATTERNS - conda = os.environ.get("CONDA_EXE", "conda") - out = run_and_read_all(run_lambda, [conda, "list"]) - if out is None: - return out - - return "\n".join( - line for line in out.splitlines() if not line.startswith("#") and any(name in line for name in patterns) - ) - - -def get_gcc_version(run_lambda): - return run_and_parse_first_match(run_lambda, "gcc --version", r"gcc (.*)") - - -def get_clang_version(run_lambda): - return run_and_parse_first_match(run_lambda, "clang --version", r"clang version (.*)") - - -def get_cmake_version(run_lambda): - return run_and_parse_first_match(run_lambda, "cmake --version", r"cmake (.*)") - - -def get_nvidia_driver_version(run_lambda): - if get_platform() == "darwin": - cmd = "kextstat | grep -i cuda" - return run_and_parse_first_match(run_lambda, cmd, r"com[.]nvidia[.]CUDA [(](.*?)[)]") - smi = get_nvidia_smi() - return run_and_parse_first_match(run_lambda, smi, r"Driver Version: (.*?) ") - - -def get_gpu_info(run_lambda): - if get_platform() == "darwin" or ( - TORCH_AVAILABLE and hasattr(torch.version, "hip") and torch.version.hip is not None - ): - if TORCH_AVAILABLE and torch.cuda.is_available(): - if torch.version.hip is not None: - prop = torch.cuda.get_device_properties(0) - if hasattr(prop, "gcnArchName"): - gcnArch = " ({})".format(prop.gcnArchName) - else: - gcnArch = "NoGCNArchNameOnOldPyTorch" - else: - gcnArch = "" - return torch.cuda.get_device_name(None) + gcnArch - return None - smi = get_nvidia_smi() - uuid_regex = re.compile(r" \(UUID: .+?\)") - rc, out, _ = run_lambda(smi + " -L") - if rc != 0: - return None - # Anonymize GPUs by removing their UUID - return re.sub(uuid_regex, "", out) - - -def get_running_cuda_version(run_lambda): - return run_and_parse_first_match(run_lambda, "nvcc --version", r"release .+ V(.*)") - - -def get_cudnn_version(run_lambda): - """Return a list of libcudnn.so; it's hard to tell which one is being used.""" - if get_platform() == "win32": - system_root = os.environ.get("SYSTEMROOT", "C:\\Windows") - cuda_path = os.environ.get("CUDA_PATH", "%CUDA_PATH%") - where_cmd = os.path.join(system_root, "System32", "where") - cudnn_cmd = '{} /R "{}\\bin" cudnn*.dll'.format(where_cmd, cuda_path) - elif get_platform() == "darwin": - # CUDA libraries and drivers can be found in /usr/local/cuda/. See - # https://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#install - # https://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html#installmac - # Use CUDNN_LIBRARY when cudnn library is installed elsewhere. - cudnn_cmd = "ls /usr/local/cuda/lib/libcudnn*" - else: - cudnn_cmd = 'ldconfig -p | grep libcudnn | rev | cut -d" " -f1 | rev' - rc, out, _ = run_lambda(cudnn_cmd) - # find will return 1 if there are permission errors or if not found - if len(out) == 0 or (rc != 1 and rc != 0): - l = os.environ.get("CUDNN_LIBRARY") - if l is not None and os.path.isfile(l): - return os.path.realpath(l) - return None - files_set = set() - for fn in out.split("\n"): - fn = os.path.realpath(fn) # eliminate symbolic links - if os.path.isfile(fn): - files_set.add(fn) - if not files_set: - return None - # Alphabetize the result because the order is non-deterministic otherwise - files = sorted(files_set) - if len(files) == 1: - return files[0] - result = "\n".join(files) - return "Probably one of the following:\n{}".format(result) - - -def get_nvidia_smi(): - # Note: nvidia-smi is currently available only on Windows and Linux - smi = "nvidia-smi" - if get_platform() == "win32": - system_root = os.environ.get("SYSTEMROOT", "C:\\Windows") - program_files_root = os.environ.get("PROGRAMFILES", "C:\\Program Files") - legacy_path = os.path.join(program_files_root, "NVIDIA Corporation", "NVSMI", smi) - new_path = os.path.join(system_root, "System32", smi) - smis = [new_path, legacy_path] - for candidate_smi in smis: - if os.path.exists(candidate_smi): - smi = '"{}"'.format(candidate_smi) - break - return smi - - -def get_rocm_version(run_lambda): - """Returns the ROCm version if available, otherwise 'N/A'.""" - return run_and_parse_first_match(run_lambda, "hipcc --version", r"HIP version: (\S+)") - - -def get_vllm_version(): - from vllm import __version__, __version_tuple__ - - if __version__ == "dev": - return "N/A (dev)" - version_str = __version_tuple__[-1] - if isinstance(version_str, str) and version_str.startswith("g"): - # it's a dev build - if "." in version_str: - # it's a dev build containing local changes - git_sha = version_str.split(".")[0][1:] - date = version_str.split(".")[-1][1:] - return f"{__version__} (git sha: {git_sha}, date: {date})" - else: - # it's a dev build without local changes - git_sha = version_str[1:] # type: ignore - return f"{__version__} (git sha: {git_sha})" - return __version__ - - -def get_vllm_omni_version(run_lambda): - try: - import vllm_omni - from vllm_omni import __version__, __version_tuple__ - - version_str = __version_tuple__[-1] - if isinstance(version_str, str) and version_str.startswith("g"): - if "." in version_str: - git_sha = version_str.split(".")[0][1:] - date = version_str.split(".")[-1][1:] - return f"{__version__} (git sha: {git_sha}, date: {date})" - else: - git_sha = version_str[1:] - return f"{__version__} (git sha: {git_sha})" - - package_dir = os.path.dirname(os.path.abspath(vllm_omni.__file__)) - git_sha = run_and_read_all(run_lambda, f"git -C {package_dir} rev-parse --short HEAD") - if git_sha: - return f"{__version__} (git sha: {git_sha})" - - return __version__ - except ImportError: - return "N/A (vllm_omni not installed)" - - -def summarize_vllm_build_flags(): - # This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc. - return "CUDA Archs: {}; ROCm: {}".format( - os.environ.get("TORCH_CUDA_ARCH_LIST", "Not Set"), - "Enabled" if os.environ.get("ROCM_HOME") else "Disabled", - ) - - -def get_gpu_topo(run_lambda): - output = None - - if get_platform() == "linux": - output = run_and_read_all(run_lambda, "nvidia-smi topo -m") - if output is None: - output = run_and_read_all(run_lambda, "rocm-smi --showtopo") - - return output - - -def get_cpu_info(run_lambda): - rc, out, err = 0, "", "" - if get_platform() == "linux": - rc, out, err = run_lambda("lscpu") - elif get_platform() == "win32": - rc, out, err = run_lambda( - "wmic cpu get Name,Manufacturer,Family,Architecture,ProcessorType,DeviceID, \ - CurrentClockSpeed,MaxClockSpeed,L2CacheSize,L2CacheSpeed,Revision /VALUE" - ) - elif get_platform() == "darwin": - rc, out, err = run_lambda("sysctl -n machdep.cpu.brand_string") - cpu_info = "None" - if rc == 0: - cpu_info = out - else: - cpu_info = err - return cpu_info - - -def get_platform(): - if sys.platform.startswith("linux"): - return "linux" - elif sys.platform.startswith("win32"): - return "win32" - elif sys.platform.startswith("cygwin"): - return "cygwin" - elif sys.platform.startswith("darwin"): - return "darwin" - else: - return sys.platform - - -def get_mac_version(run_lambda): - return run_and_parse_first_match(run_lambda, "sw_vers -productVersion", r"(.*)") - - -def get_windows_version(run_lambda): - system_root = os.environ.get("SYSTEMROOT", "C:\\Windows") - wmic_cmd = os.path.join(system_root, "System32", "Wbem", "wmic") - findstr_cmd = os.path.join(system_root, "System32", "findstr") - return run_and_read_all(run_lambda, "{} os get Caption | {} /v Caption".format(wmic_cmd, findstr_cmd)) - - -def get_lsb_version(run_lambda): - return run_and_parse_first_match(run_lambda, "lsb_release -a", r"Description:\t(.*)") - - -def check_release_file(run_lambda): - return run_and_parse_first_match(run_lambda, "cat /etc/*-release", r'PRETTY_NAME="(.*)"') - - -def get_os(run_lambda): - from platform import machine - - platform = get_platform() - - if platform == "win32" or platform == "cygwin": - return get_windows_version(run_lambda) - - if platform == "darwin": - version = get_mac_version(run_lambda) - if version is None: - return None - return "macOS {} ({})".format(version, machine()) - - if platform == "linux": - # Ubuntu/Debian based - desc = get_lsb_version(run_lambda) - if desc is not None: - return "{} ({})".format(desc, machine()) - - # Try reading /etc/*-release - desc = check_release_file(run_lambda) - if desc is not None: - return "{} ({})".format(desc, machine()) - - return "{} ({})".format(platform, machine()) - - # Unknown platform - return platform - - -def get_python_platform(): - import platform - - return platform.platform() - - -def get_libc_version(): - import platform - - if get_platform() != "linux": - return "N/A" - return "-".join(platform.libc_ver()) - - -def is_uv_venv(): - if os.environ.get("UV"): - return True - pyvenv_cfg_path = os.path.join(sys.prefix, "pyvenv.cfg") - if os.path.exists(pyvenv_cfg_path): - with open(pyvenv_cfg_path, "r") as f: - return any(line.startswith("uv = ") for line in f) - return False - - -def get_pip_packages(run_lambda, patterns=None): - """Return `pip list` output. Note: will also find conda-installed pytorch and numpy packages.""" - if patterns is None: - patterns = DEFAULT_PIP_PATTERNS - - def run_with_pip(): - try: - import importlib.util - - pip_spec = importlib.util.find_spec("pip") - pip_available = pip_spec is not None - except ImportError: - pip_available = False - - if pip_available: - cmd = [sys.executable, "-mpip", "list", "--format=freeze"] - elif is_uv_venv(): - print("uv is set") - cmd = ["uv", "pip", "list", "--format=freeze"] - else: - raise RuntimeError("Could not collect pip list output (pip or uv module not available)") - - out = run_and_read_all(run_lambda, cmd) - return "\n".join(line for line in out.splitlines() if any(name in line for name in patterns)) - - pip_version = "pip3" if sys.version[0] == "3" else "pip" - out = run_with_pip() - return pip_version, out - - -def get_cachingallocator_config(): - ca_config = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "") - return ca_config - - -def get_cuda_module_loading_config(): - if TORCH_AVAILABLE and torch.cuda.is_available(): - torch.cuda.init() - config = os.environ.get("CUDA_MODULE_LOADING", "") - return config - else: - return "N/A" - - -def is_xnnpack_available(): - if TORCH_AVAILABLE: - import torch.backends.xnnpack - - return str(torch.backends.xnnpack.enabled) # type: ignore[attr-defined] - else: - return "N/A" - - -def get_env_vars(): - env_vars = "" - secret_terms = ("secret", "token", "api", "access", "password") - report_prefix = ( - "TORCH", - "NCCL", - "PYTORCH", - "CUDA", - "CUBLAS", - "CUDNN", - "OMP_", - "MKL_", - "NVIDIA", - ) - for k, v in os.environ.items(): - if any(term in k.lower() for term in secret_terms): - continue - if k in environment_variables: - env_vars = env_vars + "{}={}".format(k, v) + "\n" - if k.startswith(report_prefix): - env_vars = env_vars + "{}={}".format(k, v) + "\n" - - return env_vars - - -def get_env_info(): - run_lambda = run - pip_version, pip_list_output = get_pip_packages(run_lambda) - - if TORCH_AVAILABLE: - version_str = torch.__version__ - debug_mode_str = str(torch.version.debug) - cuda_available_str = str(torch.cuda.is_available()) - cuda_version_str = torch.version.cuda - if not hasattr(torch.version, "hip") or torch.version.hip is None: # cuda version - hip_compiled_version = hip_runtime_version = miopen_runtime_version = "N/A" - else: # HIP version - - def get_version_or_na(cfg, prefix): - _lst = [s.rsplit(None, 1)[-1] for s in cfg if prefix in s] - return _lst[0] if _lst else "N/A" - - cfg = torch._C._show_config().split("\n") - hip_runtime_version = get_version_or_na(cfg, "HIP Runtime") - miopen_runtime_version = get_version_or_na(cfg, "MIOpen") - cuda_version_str = "N/A" - hip_compiled_version = torch.version.hip - else: - version_str = debug_mode_str = cuda_available_str = cuda_version_str = "N/A" - hip_compiled_version = hip_runtime_version = miopen_runtime_version = "N/A" - - sys_version = sys.version.replace("\n", " ") - - conda_packages = get_conda_packages(run_lambda) - - rocm_version = get_rocm_version(run_lambda) - vllm_version = get_vllm_version() - vllm_omni_version = get_vllm_omni_version(run_lambda) - vllm_build_flags = summarize_vllm_build_flags() - gpu_topo = get_gpu_topo(run_lambda) - - return SystemEnv( - torch_version=version_str, - is_debug_build=debug_mode_str, - python_version="{} ({}-bit runtime)".format(sys_version, sys.maxsize.bit_length() + 1), - python_platform=get_python_platform(), - is_cuda_available=cuda_available_str, - cuda_compiled_version=cuda_version_str, - cuda_runtime_version=get_running_cuda_version(run_lambda), - cuda_module_loading=get_cuda_module_loading_config(), - nvidia_gpu_models=get_gpu_info(run_lambda), - nvidia_driver_version=get_nvidia_driver_version(run_lambda), - cudnn_version=get_cudnn_version(run_lambda), - hip_compiled_version=hip_compiled_version, - hip_runtime_version=hip_runtime_version, - miopen_runtime_version=miopen_runtime_version, - pip_version=pip_version, - pip_packages=pip_list_output, - conda_packages=conda_packages, - os=get_os(run_lambda), - libc_version=get_libc_version(), - gcc_version=get_gcc_version(run_lambda), - clang_version=get_clang_version(run_lambda), - cmake_version=get_cmake_version(run_lambda), - caching_allocator_config=get_cachingallocator_config(), - is_xnnpack_available=is_xnnpack_available(), - cpu_info=get_cpu_info(run_lambda), - rocm_version=rocm_version, - vllm_version=vllm_version, - vllm_omni_version=vllm_omni_version, - vllm_build_flags=vllm_build_flags, - gpu_topo=gpu_topo, - env_vars=get_env_vars(), - ) - - -env_info_fmt = """ -============================== - System Info -============================== -OS : {os} -GCC version : {gcc_version} -Clang version : {clang_version} -CMake version : {cmake_version} -Libc version : {libc_version} - -============================== - PyTorch Info -============================== -PyTorch version : {torch_version} -Is debug build : {is_debug_build} -CUDA used to build PyTorch : {cuda_compiled_version} -ROCM used to build PyTorch : {hip_compiled_version} - -============================== - Python Environment -============================== -Python version : {python_version} -Python platform : {python_platform} - -============================== - CUDA / GPU Info -============================== -Is CUDA available : {is_cuda_available} -CUDA runtime version : {cuda_runtime_version} -CUDA_MODULE_LOADING set to : {cuda_module_loading} -GPU models and configuration : {nvidia_gpu_models} -Nvidia driver version : {nvidia_driver_version} -cuDNN version : {cudnn_version} -HIP runtime version : {hip_runtime_version} -MIOpen runtime version : {miopen_runtime_version} -Is XNNPACK available : {is_xnnpack_available} - -============================== - CPU Info -============================== -{cpu_info} - -============================== -Versions of relevant libraries -============================== -{pip_packages} -{conda_packages} -""".strip() - -# both the above code and the following code use `strip()` to -# remove leading/trailing whitespaces, so we need to add a newline -# in between to separate the two sections -env_info_fmt += "\n\n" - -env_info_fmt += """ -============================== - vLLM Info -============================== -ROCM Version : {rocm_version} -vLLM Version : {vllm_version} -vLLM-Omni Version : {vllm_omni_version} -vLLM Build Flags: - {vllm_build_flags} -GPU Topology: - {gpu_topo} - -============================== - Environment Variables -============================== -{env_vars} -""".strip() - - -def pretty_str(envinfo): - def replace_nones(dct, replacement="Could not collect"): - for key in dct.keys(): - if dct[key] is not None: - continue - dct[key] = replacement - return dct - - def replace_bools(dct, true="Yes", false="No"): - for key in dct.keys(): - if dct[key] is True: - dct[key] = true - elif dct[key] is False: - dct[key] = false - return dct - - def prepend(text, tag="[prepend]"): - lines = text.split("\n") - updated_lines = [tag + line for line in lines] - return "\n".join(updated_lines) - - def replace_if_empty(text, replacement="No relevant packages"): - if text is not None and len(text) == 0: - return replacement - return text - - def maybe_start_on_next_line(string): - # If `string` is multiline, prepend a \n to it. - if string is not None and len(string.split("\n")) > 1: - return "\n{}\n".format(string) - return string - - mutable_dict = envinfo._asdict() - - # If nvidia_gpu_models is multiline, start on the next line - mutable_dict["nvidia_gpu_models"] = maybe_start_on_next_line(envinfo.nvidia_gpu_models) - - # If the machine doesn't have CUDA, report some fields as 'No CUDA' - dynamic_cuda_fields = [ - "cuda_runtime_version", - "nvidia_gpu_models", - "nvidia_driver_version", - ] - all_cuda_fields = dynamic_cuda_fields + ["cudnn_version"] - all_dynamic_cuda_fields_missing = all(mutable_dict[field] is None for field in dynamic_cuda_fields) - if TORCH_AVAILABLE and not torch.cuda.is_available() and all_dynamic_cuda_fields_missing: - for field in all_cuda_fields: - mutable_dict[field] = "No CUDA" - if envinfo.cuda_compiled_version is None: - mutable_dict["cuda_compiled_version"] = "None" - - # Replace True with Yes, False with No - mutable_dict = replace_bools(mutable_dict) - - # Replace all None objects with 'Could not collect' - mutable_dict = replace_nones(mutable_dict) - - # If either of these are '', replace with 'No relevant packages' - mutable_dict["pip_packages"] = replace_if_empty(mutable_dict["pip_packages"]) - mutable_dict["conda_packages"] = replace_if_empty(mutable_dict["conda_packages"]) - - # Tag conda and pip packages with a prefix - # If they were previously None, they'll show up as ie '[conda] Could not collect' - if mutable_dict["pip_packages"]: - mutable_dict["pip_packages"] = prepend(mutable_dict["pip_packages"], "[{}] ".format(envinfo.pip_version)) - if mutable_dict["conda_packages"]: - mutable_dict["conda_packages"] = prepend(mutable_dict["conda_packages"], "[conda] ") - mutable_dict["cpu_info"] = envinfo.cpu_info - return env_info_fmt.format(**mutable_dict) - - -def get_pretty_env_info(): - return pretty_str(get_env_info()) - - -def main(): - print("Collecting environment information...") - output = get_pretty_env_info() - print(output) - - if TORCH_AVAILABLE and hasattr(torch, "utils") and hasattr(torch.utils, "_crash_handler"): - minidump_dir = torch.utils._crash_handler.DEFAULT_MINIDUMP_DIR - if sys.platform == "linux" and os.path.exists(minidump_dir): - dumps = [os.path.join(minidump_dir, dump) for dump in os.listdir(minidump_dir)] - latest = max(dumps, key=os.path.getctime) - ctime = os.path.getctime(latest) - creation_time = datetime.datetime.fromtimestamp(ctime).strftime("%Y-%m-%d %H:%M:%S") - msg = ( - "\n*** Detected a minidump at {} created on {}, ".format(latest, creation_time) - + "if this is related to your bug please include it when you file a report ***" - ) - print(msg, file=sys.stderr) - - -if __name__ == "__main__": - main() diff --git a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py index 5dfcdc542f..9b8413d0a2 100644 --- a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py +++ b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py @@ -88,6 +88,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): # Store vllm_config for potential future use self.vllm_config = vllm_config + @staticmethod + def extract_val(d, key, default): + val = d.get(key, default) + if isinstance(val, list): + return val[0] if len(val) > 0 else default + return val + def forward( self, input_ids: torch.Tensor | None = None, @@ -115,18 +122,11 @@ def forward( batched_keys = {"ref_audio", "ref_text", "x_vector_only_mode", "voice_clone_prompt"} for req_info in runtime_info_list: - - def extract_val(d, key, default): - val = d.get(key, default) - if isinstance(val, list): - return val[0] if len(val) > 0 else default - return val - - texts.append(extract_val(req_info, "text", "")) - task_types.append(extract_val(req_info, "task_type", self.task_type)) - speakers.append(extract_val(req_info, "speaker", "uncle_fu")) - languages.append(extract_val(req_info, "language", "Auto")) - instructs.append(extract_val(req_info, "instruct", "")) + texts.append(self.extract_val(req_info, "text", "")) + task_types.append(self.extract_val(req_info, "task_type", self.task_type)) + speakers.append(self.extract_val(req_info, "speaker", "uncle_fu")) + languages.append(self.extract_val(req_info, "language", "Auto")) + instructs.append(self.extract_val(req_info, "instruct", "")) for k, v in req_info.items(): if k not in ["text", "task_type", "speaker", "language", "instruct"]: diff --git a/vllm_omni/model_executor/stage_configs/qwen3_tts.yaml b/vllm_omni/model_executor/stage_configs/qwen3_tts.yaml index 09cf2a4ccc..4fcd8c38ef 100644 --- a/vllm_omni/model_executor/stage_configs/qwen3_tts.yaml +++ b/vllm_omni/model_executor/stage_configs/qwen3_tts.yaml @@ -3,7 +3,7 @@ stage_args: stage_type: llm # Use llm stage type to launch OmniLLM runtime: devices: "0" - max_batch_size: 100 + max_batch_size: 10 engine_args: model_stage: qwen3_tts model_arch: Qwen3TTSForConditionalGeneration @@ -14,7 +14,7 @@ stage_args: async_scheduling: false enable_prefix_caching: false engine_output_type: audio # Final output: audio waveform - gpu_memory_utilization: 0.8 + gpu_memory_utilization: 0.5 distributed_executor_backend: "mp" max_num_batched_tokens: 1000000 From b8daf11e6c185fedbcd69bee8fddd240d921f351 Mon Sep 17 00:00:00 2001 From: Roman Koshkin Date: Sat, 21 Feb 2026 16:22:36 +0900 Subject: [PATCH 3/4] [Bugfix] [cleanup2](qwen3_tts): enable batched offline inference by fixing tensor slicing Signed-off-by: Roman Koshkin --- vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py index 9b8413d0a2..9a0a993a60 100644 --- a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py +++ b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py @@ -149,6 +149,8 @@ def forward( merged_kwargs["max_new_tokens"] = 2 # Assume uniform task type across the batch + if len(set(task_types)) > 1: + raise ValueError(f"Mixed task types not supported: {set(task_types)}") task_type = task_types[0] # Call the appropriate generation method based on task_type, passing lists From 8b80aeb753c65ae4f5e0659aecc4b81dcdc01d90 Mon Sep 17 00:00:00 2001 From: Roman Koshkin Date: Sat, 21 Feb 2026 16:29:44 +0900 Subject: [PATCH 4/4] [Bugfix] [cleanup3](qwen3_tts): enable batched offline inference by fixing tensor slicing Signed-off-by: Roman Koshkin --- .gitignore | 2 -- 1 file changed, 2 deletions(-) diff --git a/.gitignore b/.gitignore index 214c8efb42..12486f4a7f 100644 --- a/.gitignore +++ b/.gitignore @@ -245,5 +245,3 @@ tmp_test # output files *.wav -examples/offline_inference/qwen3_tts/test.py -examples/online_serving/qwen3_tts/Untitled.ipynb