Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 61 additions & 4 deletions examples/offline_inference/qwen3_omni/end2end.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from vllm.multimodal.image import convert_image_mode
from vllm.utils.argparse_utils import FlexibleArgumentParser

# Import Omni engine
from vllm_omni.entrypoints.omni import Omni

SEED = 42
Expand Down Expand Up @@ -294,11 +295,30 @@ def main(args):
else:
query_result = query_func()

# Build kwargs with CLI overrides.
# Global params (e.g. --gpu-memory-utilization) apply to all stages;
# per-stage overrides (--stage-N-*) take precedence when specified.
omni_kwargs = {
"stage_configs_path": args.stage_configs_path,
"log_stats": args.log_stats,
"stage_init_timeout": args.stage_init_timeout,
}

# Add CLI overrides if specified
if args.gpu_memory_utilization is not None:
omni_kwargs["gpu_memory_utilization"] = args.gpu_memory_utilization
if args.tensor_parallel_size is not None:
omni_kwargs["tensor_parallel_size"] = args.tensor_parallel_size
if args.devices is not None:
omni_kwargs["devices"] = args.devices
if args.enforce_eager:
omni_kwargs["enforce_eager"] = args.enforce_eager
if args.trust_remote_code:
omni_kwargs["trust_remote_code"] = args.trust_remote_code

omni_llm = Omni(
model=model_name,
stage_configs_path=args.stage_configs_path,
log_stats=args.log_stats,
stage_init_timeout=args.stage_init_timeout,
**omni_kwargs,
)

thinker_sampling_params = SamplingParams(
Expand Down Expand Up @@ -458,6 +478,12 @@ def parse_args():
default="output_audio",
help="[Deprecated] Output wav directory (use --output-dir).",
)
parser.add_argument(
"--output-dir",
type=str,
default=None,
help="Output directory for generated files (text and audio).",
)
parser.add_argument(
"--num-prompts",
type=int,
Expand All @@ -474,7 +500,38 @@ def parse_args():
"--stage-configs-path",
type=str,
default=None,
help="Path to a stage configs file.",
help="Path to a stage configs file. If not specified, auto-detected from model.",
)
# CLI override arguments
parser.add_argument(
"--gpu-memory-utilization",
type=float,
default=None,
help="GPU memory utilization for all stages (CLI override). Example: 0.9",
)
parser.add_argument(
"--tensor-parallel-size",
type=int,
default=None,
help="Tensor parallel size for all stages (CLI override). Example: 2",
)
Comment on lines 506 to 517
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So if these parameters are set individually now, will they apply to all stages?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right — global CLI params like --gpu-memory-utilization apply to every stage by default. Per-stage overrides (--stage-N-*) take precedence when specified. Added a comment at the kwargs block to clarify this.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, global args like --gpu-memory-utilization apply to all stages. Per-stage overrides (--stage-N-*) take precedence when specified — this will be wired up in [2/N].

parser.add_argument(
"--devices",
type=str,
default=None,
help="Device assignment for stages (CLI override). Example: '0,1'",
)
parser.add_argument(
"--enforce-eager",
action="store_true",
default=False,
help="Enforce eager mode for all stages (CLI override).",
)
parser.add_argument(
"--trust-remote-code",
action="store_true",
default=False,
help="Trust remote code for model loading (CLI override).",
)
parser.add_argument(
"--video-path",
Expand Down
Loading