19 changes: 17 additions & 2 deletions components/src/dynamo/vllm/args.py
@@ -69,6 +69,7 @@ class Config:
multimodal_processor: bool = False
multimodal_encode_worker: bool = False
multimodal_worker: bool = False
multimodal_decode_worker: bool = False
multimodal_encode_prefill_worker: bool = False
mm_prompt_template: str = "USER: <image>\n<prompt> ASSISTANT:"
# dump config to file
@@ -170,6 +171,11 @@ def parse_args() -> Config:
action="store_true",
help="Run as multimodal worker component for LLM inference with multimodal data",
)
parser.add_argument(
"--multimodal-decode-worker",
action="store_true",
help="Run as multimodal decode worker in disaggregated mode",
)
parser.add_argument(
"--multimodal-encode-prefill-worker",
action="store_true",
@@ -218,11 +224,12 @@ def parse_args() -> Config:
int(bool(args.multimodal_processor))
+ int(bool(args.multimodal_encode_worker))
+ int(bool(args.multimodal_worker))
+ int(bool(args.multimodal_decode_worker))
+ int(bool(args.multimodal_encode_prefill_worker))
)
if mm_flags > 1:
raise ValueError(
"Use only one of --multimodal-processor, --multimodal-encode-worker, --multimodal-worker, or --multimodal-encode-prefill-worker"
"Use only one of --multimodal-processor, --multimodal-encode-worker, --multimodal-worker, --multimodal-decode-worker, or --multimodal-encode-prefill-worker"
)

# Set component and endpoint based on worker type
@@ -235,8 +242,15 @@ def parse_args() -> Config:
elif args.multimodal_encode_prefill_worker:
config.component = "encoder"
config.endpoint = "generate"
elif args.multimodal_decode_worker:
# Multimodal decode worker in disaggregated mode
# Uses "decoder" component name because prefill worker connects to "decoder"
# (prefill uses "backend" to receive from encoder)
config.component = "decoder"
config.endpoint = "generate"
elif args.multimodal_worker and args.is_prefill_worker:
config.component = "prefill"
# Multimodal prefill worker stays as "backend" to maintain encoder connection
config.component = "backend"
config.endpoint = "generate"
elif args.is_prefill_worker:
config.component = "prefill"
@@ -258,6 +272,7 @@ def parse_args() -> Config:
config.multimodal_processor = args.multimodal_processor
config.multimodal_encode_worker = args.multimodal_encode_worker
config.multimodal_worker = args.multimodal_worker
config.multimodal_decode_worker = args.multimodal_decode_worker
config.multimodal_encode_prefill_worker = args.multimodal_encode_prefill_worker
config.mm_prompt_template = args.mm_prompt_template
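
For quick reference, here is a minimal sketch (not part of the diff) of the worker-type to component/endpoint mapping that parse_args() establishes above. Only the branches visible in this diff are shown, and the helper name resolve_component is hypothetical.

# Illustrative summary only; branches for the processor and encode worker
# are outside the visible hunks and therefore omitted.
def resolve_component(args) -> tuple[str, str]:
    if args.multimodal_encode_prefill_worker:
        return "encoder", "generate"
    if args.multimodal_decode_worker:
        # Named "decoder" so the multimodal prefill worker can discover it.
        return "decoder", "generate"
    if args.multimodal_worker and args.is_prefill_worker:
        # Stays "backend" so the encode worker's existing connection keeps working.
        return "backend", "generate"
    # ... remaining worker types follow the pre-existing branches.
    raise ValueError("unhandled worker type in this sketch")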

35 changes: 27 additions & 8 deletions components/src/dynamo/vllm/main.py
@@ -29,6 +29,7 @@
from dynamo.runtime.logging import configure_dynamo_logging
from dynamo.vllm.multimodal_handlers import (
EncodeWorkerHandler,
MultimodalDecodeWorkerHandler,
MultimodalPDWorkerHandler,
ProcessorHandler,
)
@@ -105,7 +106,11 @@ def signal_handler():
elif config.multimodal_encode_worker:
await init_multimodal_encode_worker(runtime, config)
logger.debug("init_multimodal_encode_worker completed")
elif config.multimodal_worker or config.multimodal_encode_prefill_worker:
elif (
config.multimodal_worker
or config.multimodal_decode_worker
or config.multimodal_encode_prefill_worker
):
await init_multimodal_worker(runtime, config)
logger.debug("init_multimodal_worker completed")
elif config.is_prefill_worker:
@@ -129,7 +134,6 @@ def setup_kv_event_publisher(
"""
Set up KV event publishers for prefix caching if enabled.
Creates one publisher per dp_rank since each dp_rank publishes to a different port.

Args:
config: Worker configuration
component: Component for runtime integration
@@ -632,13 +636,28 @@ async def init_multimodal_worker(runtime: DistributedRuntime, config: Config):

engine_client, vllm_config, default_sampling_params = setup_vllm_engine(config)

# For aggregated mode, no downstream client is needed
# TODO: Implement disaggregated mode with proper decode worker client
downstream_client = None
# Set up decode worker client for disaggregated mode
decode_worker_client = None
if config.is_prefill_worker:
# Prefill worker needs to connect to decode worker
decode_worker_client = (
await runtime.namespace(config.namespace)
.component("decoder")
.endpoint("generate")
.client()
)
await decode_worker_client.wait_for_instances()
logger.info("Connected to decode worker for disaggregated mode")

handler = MultimodalPDWorkerHandler(
runtime, component, engine_client, config, downstream_client
)
# Choose handler based on worker type
if config.multimodal_decode_worker:
handler = MultimodalDecodeWorkerHandler(
runtime, component, engine_client, config
)
else:
handler = MultimodalPDWorkerHandler(
runtime, component, engine_client, config, decode_worker_client
)

await handler.async_init(runtime)
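
As a reading aid, the sketch below condenses the client wiring that init_multimodal_worker() performs on the disaggregated path: the multimodal prefill worker (component "backend") looks up the decode worker (component "decoder") through the same DistributedRuntime call chain shown above. The standalone helper is hypothetical; only runtime calls that appear in this diff are used.

# Hypothetical helper illustrating the disaggregated multimodal topology:
#   encode worker -> "backend" (multimodal prefill) -> "decoder" (multimodal decode)
async def connect_to_decode_worker(runtime, namespace: str):
    # Same call chain used above when config.is_prefill_worker is set.
    client = (
        await runtime.namespace(namespace)
        .component("decoder")
        .endpoint("generate")
        .client()
    )
    # Block until at least one decode worker instance has registered.
    await client.wait_for_instances()
    return client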

112 changes: 112 additions & 0 deletions examples/backends/vllm/launch/disagg_multimodal.sh
@@ -0,0 +1,112 @@
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
set -e
trap 'echo Cleaning up...; kill 0' EXIT

# Default values
MODEL_NAME="llava-hf/llava-1.5-7b-hf"
PROMPT_TEMPLATE="USER: <image>\n<prompt> ASSISTANT:"
PROVIDED_PROMPT_TEMPLATE=""

# Parse command line arguments
while [[ $# -gt 0 ]]; do
case $1 in
--model)
MODEL_NAME=$2
shift 2
;;
--prompt-template)
PROVIDED_PROMPT_TEMPLATE=$2
shift 2
;;
-h|--help)
echo "Usage: $0 [OPTIONS]"
echo ""
echo "Disaggregated multimodal serving with separate Encode/Prefill/Decode workers"
echo ""
echo "Options:"
echo " --model <model_name> Specify the VLM model to use (default: $MODEL_NAME)"
echo " --prompt-template <template> Specify the multi-modal prompt template to use"
echo " LLaVA 1.5 7B, Qwen2.5-VL, and Phi3V models have predefined templates"
echo " -h, --help Show this help message"
echo ""
echo "Examples:"
echo " $0 --model llava-hf/llava-1.5-7b-hf"
echo " $0 --model microsoft/Phi-3.5-vision-instruct"
echo " $0 --model Qwen/Qwen2.5-VL-7B-Instruct"
echo ""
exit 0
;;
*)
echo "Unknown option: $1"
echo "Use --help for usage information"
exit 1
;;
esac
done

# Set PROMPT_TEMPLATE based on the MODEL_NAME
if [[ -n "$PROVIDED_PROMPT_TEMPLATE" ]]; then
PROMPT_TEMPLATE="$PROVIDED_PROMPT_TEMPLATE"
elif [[ "$MODEL_NAME" == "llava-hf/llava-1.5-7b-hf" ]]; then
PROMPT_TEMPLATE="USER: <image>\n<prompt> ASSISTANT:"
elif [[ "$MODEL_NAME" == "microsoft/Phi-3.5-vision-instruct" ]]; then
PROMPT_TEMPLATE="<|user|>\n<|image_1|>\n<prompt><|end|>\n<|assistant|>\n"
elif [[ "$MODEL_NAME" == "Qwen/Qwen2.5-VL-7B-Instruct" ]]; then
PROMPT_TEMPLATE="<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|><prompt><|im_end|>\n<|im_start|>assistant\n"
else
echo "No multi-modal prompt template is defined for the model: $MODEL_NAME"
echo "Please provide a prompt template using --prompt-template option."
echo "Example: --prompt-template 'USER: <image>\n<prompt> ASSISTANT:'"
exit 1
fi

echo "=================================================="
echo "Disaggregated Multimodal Serving"
echo "=================================================="
echo "Model: $MODEL_NAME"
echo "Prompt Template: $PROMPT_TEMPLATE"
echo "=================================================="

# Configure UCX for local same-machine communication
# Use shared memory (sm), self, and TCP transports instead of InfiniBand
# This prevents NIXL_ERR_BACKEND errors when running all workers on the same machine
export UCX_TLS=sm,self,tcp

echo "UCX Transport: $UCX_TLS (local same-machine mode)"
echo "=================================================="

# Start frontend (no router mode)
echo "Starting frontend..."
python -m dynamo.frontend --http-port=8000 &

# Start processor
echo "Starting processor..."
python -m dynamo.vllm --multimodal-processor --model $MODEL_NAME --mm-prompt-template "$PROMPT_TEMPLATE" &

# Configure GPU memory optimization for specific models
EXTRA_ARGS=""
if [[ "$MODEL_NAME" == "Qwen/Qwen2.5-VL-7B-Instruct" ]]; then
EXTRA_ARGS="--gpu-memory-utilization 0.85 --max-model-len 2048"
fi

# Start encode worker
echo "Starting encode worker on GPU 1..."
CUDA_VISIBLE_DEVICES=1 python -m dynamo.vllm --multimodal-encode-worker --model $MODEL_NAME --enforce-eager $EXTRA_ARGS &

# Start prefill worker
echo "Starting prefill worker on GPU 2..."
CUDA_VISIBLE_DEVICES=2 python -m dynamo.vllm --multimodal-worker --is-prefill-worker --model $MODEL_NAME --enforce-eager --connector nixl $EXTRA_ARGS &

# Start decode worker
echo "Starting decode worker on GPU 3..."
CUDA_VISIBLE_DEVICES=3 python -m dynamo.vllm --multimodal-decode-worker --model $MODEL_NAME --enforce-eager --connector nixl $EXTRA_ARGS &

echo "=================================================="
echo "All components started. Waiting for initialization..."
echo "=================================================="

# Wait for all background processes to complete
wait
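
Once all workers have initialized, requests go to the frontend on port 8000. The Python snippet below is an illustrative client only; it assumes the frontend exposes an OpenAI-compatible /v1/chat/completions route and uses a placeholder image URL, so adjust both to match your deployment.

# Illustrative client; endpoint path, payload shape, and image URL are assumptions.
import requests

response = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "llava-hf/llava-1.5-7b-hf",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe this image."},
                    {"type": "image_url", "image_url": {"url": "http://example.com/cat.jpg"}},
                ],
            }
        ],
        "max_tokens": 64,
    },
    timeout=120,
)
print(response.json())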
