ai-dynamo · ayushag-nv · Nov 6, 2025 · Nov 7, 2025 · Nov 7, 2025
@@ -69,6 +69,7 @@ class Config:
     multimodal_processor: bool = False
     multimodal_encode_worker: bool = False
     multimodal_worker: bool = False
+    multimodal_decode_worker: bool = False
     multimodal_encode_prefill_worker: bool = False
     mm_prompt_template: str = "USER: <image>\n<prompt> ASSISTANT:"
     # dump config to file
@@ -170,6 +171,11 @@ def parse_args() -> Config:
         action="store_true",
         help="Run as multimodal worker component for LLM inference with multimodal data",
     )
+    parser.add_argument(
+        "--multimodal-decode-worker",
+        action="store_true",
+        help="Run as multimodal decode worker in disaggregated mode",
+    )
     parser.add_argument(
         "--multimodal-encode-prefill-worker",
         action="store_true",
@@ -218,11 +224,12 @@ def parse_args() -> Config:
         int(bool(args.multimodal_processor))
         + int(bool(args.multimodal_encode_worker))
         + int(bool(args.multimodal_worker))
+        + int(bool(args.multimodal_decode_worker))
         + int(bool(args.multimodal_encode_prefill_worker))
     )
     if mm_flags > 1:
         raise ValueError(
-            "Use only one of --multimodal-processor, --multimodal-encode-worker, --multimodal-worker, or --multimodal-encode-prefill-worker"
+            "Use only one of --multimodal-processor, --multimodal-encode-worker, --multimodal-worker, --multimodal-decode-worker, or --multimodal-encode-prefill-worker"
         )
 
     # Set component and endpoint based on worker type
@@ -235,8 +242,15 @@ def parse_args() -> Config:
     elif args.multimodal_encode_prefill_worker:
         config.component = "encoder"
         config.endpoint = "generate"
+    elif args.multimodal_decode_worker:
+        # Multimodal decode worker in disaggregated mode.
+        # Registers as "decoder" because the prefill worker connects to component "decoder"
+        # (the prefill worker itself registers as "backend" to receive from the encoder).
+        config.component = "decoder"
+        config.endpoint = "generate"
     elif args.multimodal_worker and args.is_prefill_worker:
-        config.component = "prefill"
+        # Multimodal prefill worker registers as "backend" (not "prefill") to maintain encoder connection
+        config.component = "backend"
         config.endpoint = "generate"
     elif args.is_prefill_worker:
         config.component = "prefill"
@@ -258,6 +272,7 @@ def parse_args() -> Config:
     config.multimodal_processor = args.multimodal_processor
     config.multimodal_encode_worker = args.multimodal_encode_worker
     config.multimodal_worker = args.multimodal_worker
+    config.multimodal_decode_worker = args.multimodal_decode_worker
     config.multimodal_encode_prefill_worker = args.multimodal_encode_prefill_worker
     config.mm_prompt_template = args.mm_prompt_template
 

@@ -29,6 +29,7 @@
 from dynamo.runtime.logging import configure_dynamo_logging
 from dynamo.vllm.multimodal_handlers import (
     EncodeWorkerHandler,
+    MultimodalDecodeWorkerHandler,
     MultimodalPDWorkerHandler,
     ProcessorHandler,
 )
@@ -105,7 +106,11 @@ def signal_handler():
     elif config.multimodal_encode_worker:
         await init_multimodal_encode_worker(runtime, config)
         logger.debug("init_multimodal_encode_worker completed")
-    elif config.multimodal_worker or config.multimodal_encode_prefill_worker:
+    elif (
+        config.multimodal_worker
+        or config.multimodal_decode_worker
+        or config.multimodal_encode_prefill_worker
+    ):
         await init_multimodal_worker(runtime, config)
         logger.debug("init_multimodal_worker completed")
     elif config.is_prefill_worker:
@@ -129,7 +134,6 @@ def setup_kv_event_publisher(
     """
     Set up KV event publishers for prefix caching if enabled.
     Creates one publisher per dp_rank since each dp_rank publishes to a different port.
-
     Args:
         config: Worker configuration
         component: Component for runtime integration
@@ -632,13 +636,37 @@ async def init_multimodal_worker(runtime: DistributedRuntime, config: Config):
 
     engine_client, vllm_config, default_sampling_params = setup_vllm_engine(config)
 
-    # For aggregated mode, no downstream client is needed
-    # TODO: Implement disaggregated mode with proper decode worker client
-    downstream_client = None
+    """
+
+    is_prefill_worker: 
+    handler: MultimodalPDWorkerHandler with decode_worker_client
 
-    handler = MultimodalPDWorkerHandler(
-        runtime, component, engine_client, config, downstream_client
-    )
+    if-decode-worker:
+    handler: MultimodalDecodeWorkerHandler
+    """
+
+    # Set up decode worker client for disaggregated mode
+    decode_worker_client = None
+    if config.is_prefill_worker:
+        # Prefill worker needs to connect to decode worker
+        decode_worker_client = (
+            await runtime.namespace(config.namespace)
+            .component("decoder")
+            .endpoint("generate")
+            .client()
+        )
+        await decode_worker_client.wait_for_instances()
+        logger.info("Connected to decode worker for disaggregated mode")
+
+    # Choose handler based on worker type
+    if config.multimodal_decode_worker:
+        handler = MultimodalDecodeWorkerHandler(
+            runtime, component, engine_client, config
+        )
+    else:
+        handler = MultimodalPDWorkerHandler(
+            runtime, component, engine_client, config, decode_worker_client
+        )
 
     await handler.async_init(runtime)
 

diff --git a/components/src/dynamo/vllm/multimodal_handlers/worker_handler.py b/components/src/dynamo/vllm/multimodal_handlers/worker_handler.py
@@ -58,6 +58,10 @@ async def generate(self, request: vLLMMultimodalRequest, context):
                 request = vLLMMultimodalRequest.model_validate(request)
         logger.debug(f"Received decode request: {{ id: {request.request_id} }}.")
 
+        print(f"Decode Worker request: {request}")
+        print("#########################")
+        print("Decode Worker is Generating...")
+        print("#########################")
         # Decode worker doesn't process embeddings, so we pass None or empty tensor
         gen = self.engine_client.generate(
             prompt=TokensPrompt(
@@ -66,8 +70,16 @@ async def generate(self, request: vLLMMultimodalRequest, context):
             sampling_params=request.sampling_params,
             request_id=request.request_id,
         )
+
+        print("#########################")
+        print("Decode Worker generated tokens...")
+        print("#########################")
+
 
         async for response in gen:
+            print("#########################")
+            print(f"Decode Worker generated token response: {response}")
+            print("#########################")
             logger.debug(f"Response kv_transfer_params: {response.kv_transfer_params}")
             yield MyRequestOutput(
                 request_id=response.request_id,
@@ -200,6 +212,9 @@ async def generate(self, request: vLLMMultimodalRequest, context):
 
             logger.debug("Prefill request: %s", pd_request)
 
+
+        print("#########################")
+        print("Prefill Worker is Generating...")
         gen = self.engine_client.generate(
             prompt=TokensPrompt(
                 prompt_token_ids=pd_request.engine_prompt["prompt_token_ids"],
@@ -209,6 +224,9 @@ async def generate(self, request: vLLMMultimodalRequest, context):
             request_id=pd_request.request_id,
         )
 
+        print("#########################")
+        print("Prefill Worker generated tokens...")
+
         if self.enable_disagg and self.decode_worker_client:
             decode_request = copy.deepcopy(request)
             async for prefill_response in gen:

@@ -0,0 +1,120 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Launch disaggregated multimodal serving: a frontend, a processor, and
+# separate encode / prefill / decode workers pinned to GPUs 1, 2 and 3.
+set -e
+# `kill 0` signals every process in this script's process group, so the
+# background components started below are torn down whenever the script exits.
+trap 'echo Cleaning up...; kill 0' EXIT
+
+# Default values
+MODEL_NAME="llava-hf/llava-1.5-7b-hf"
+PROMPT_TEMPLATE="USER: <image>\n<prompt> ASSISTANT:"
+PROVIDED_PROMPT_TEMPLATE=""
+
+# Parse command line arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --model)
+            # Fail with a message instead of letting `shift 2` abort silently under `set -e`.
+            [[ $# -ge 2 ]] || { echo "Error: --model requires a value" >&2; exit 1; }
+            MODEL_NAME=$2
+            shift 2
+            ;;
+        --prompt-template)
+            [[ $# -ge 2 ]] || { echo "Error: --prompt-template requires a value" >&2; exit 1; }
+            PROVIDED_PROMPT_TEMPLATE=$2
+            shift 2
+            ;;
+        -h|--help)
+            echo "Usage: $0 [OPTIONS]"
+            echo ""
+            echo "Disaggregated multimodal serving with separate Encode/Prefill/Decode workers"
+            echo ""
+            echo "Options:"
+            echo "  --model <model_name>          Specify the VLM model to use (default: $MODEL_NAME)"
+            echo "  --prompt-template <template>  Specify the multi-modal prompt template to use"
+            echo "                                LLaVA 1.5 7B, Qwen2.5-VL, and Phi3V models have predefined templates"
+            echo "  -h, --help                    Show this help message"
+            echo ""
+            echo "Examples:"
+            echo "  $0 --model llava-hf/llava-1.5-7b-hf"
+            echo "  $0 --model microsoft/Phi-3.5-vision-instruct"
+            echo "  $0 --model Qwen/Qwen2.5-VL-7B-Instruct"
+            echo ""
+            exit 0
+            ;;
+        *)
+            echo "Unknown option: $1" >&2
+            echo "Use --help for usage information" >&2
+            exit 1
+            ;;
+    esac
+done
+
+# Set PROMPT_TEMPLATE based on the MODEL_NAME
+if [[ -n "$PROVIDED_PROMPT_TEMPLATE" ]]; then
+    PROMPT_TEMPLATE="$PROVIDED_PROMPT_TEMPLATE"
+elif [[ "$MODEL_NAME" == "llava-hf/llava-1.5-7b-hf" ]]; then
+    PROMPT_TEMPLATE="USER: <image>\n<prompt> ASSISTANT:"
+elif [[ "$MODEL_NAME" == "microsoft/Phi-3.5-vision-instruct" ]]; then
+    PROMPT_TEMPLATE="<|user|>\n<|image_1|>\n<prompt><|end|>\n<|assistant|>\n"
+elif [[ "$MODEL_NAME" == "Qwen/Qwen2.5-VL-7B-Instruct" ]]; then
+    PROMPT_TEMPLATE="<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|><prompt><|im_end|>\n<|im_start|>assistant\n"
+else
+    echo "No multi-modal prompt template is defined for the model: $MODEL_NAME" >&2
+    echo "Please provide a prompt template using --prompt-template option." >&2
+    echo "Example: --prompt-template 'USER: <image>\n<prompt> ASSISTANT:'" >&2
+    exit 1
+fi
+
+echo "=================================================="
+echo "Disaggregated Multimodal Serving"
+echo "=================================================="
+echo "Model: $MODEL_NAME"
+echo "Prompt Template: $PROMPT_TEMPLATE"
+echo "=================================================="
+
+# Optional override of UCX transports; while it stays commented out, UCX_TLS is
+# whatever the caller's environment provides (possibly unset).
+#export UCX_TLS=sm,self,tcp
+
+echo "UCX Transport: ${UCX_TLS:-<UCX_TLS not set>}"
+echo "=================================================="
+
+# Start frontend (no router mode)
+echo "Starting frontend..."
+python -m dynamo.frontend --http-port=8000 &
+
+# Start processor
+echo "Starting processor..."
+python -m dynamo.vllm --multimodal-processor --model "$MODEL_NAME" --mm-prompt-template "$PROMPT_TEMPLATE" &
+
+# Configure GPU memory optimization for specific models
+# (an array, so each flag stays a separate word and an empty list expands to nothing)
+EXTRA_ARGS=()
+if [[ "$MODEL_NAME" == "Qwen/Qwen2.5-VL-7B-Instruct" ]]; then
+    EXTRA_ARGS=(--gpu-memory-utilization 0.85 --max-model-len 2048)
+fi
+
+# Start encode worker
+echo "Starting encode worker on GPU 1..."
+CUDA_VISIBLE_DEVICES=1 python -m dynamo.vllm --multimodal-encode-worker --model "$MODEL_NAME" "${EXTRA_ARGS[@]}" &
+
+# Start prefill worker
+echo "Starting prefill worker on GPU 2..."
+CUDA_VISIBLE_DEVICES=2 python -m dynamo.vllm --multimodal-worker --is-prefill-worker --model "$MODEL_NAME" "${EXTRA_ARGS[@]}" &
+
+# Start decode worker
+echo "Starting decode worker on GPU 3..."
+CUDA_VISIBLE_DEVICES=3 python -m dynamo.vllm --multimodal-decode-worker --model "$MODEL_NAME" "${EXTRA_ARGS[@]}" &
+
+echo "=================================================="
+echo "All components started. Waiting for initialization..."
+echo "=================================================="
+
+# Wait for all background processes to complete
+wait
+