 )
 from dynamo.runtime import DistributedRuntime, dynamo_worker
 from dynamo.runtime.logging import configure_dynamo_logging
+from dynamo.vllm.multimodal_handlers import (
+    EncodeWorkerHandler,
+    MultimodalPDWorkerHandler,
+    ProcessorHandler,
+)
 
 from .args import ENABLE_LMCACHE, Config, configure_ports, overwrite_args, parse_args
 from .handlers import DecodeWorkerHandler, PrefillWorkerHandler
@@ -92,7 +97,17 @@ def signal_handler():
     if not os.path.exists(config.model):
         config.model = config.engine_args.model = await fetch_llm(config.model)
 
-    if config.is_prefill_worker:
+    # Route to the appropriate initialization path based on the config flags
+    if config.multimodal_processor:
+        await init_multimodal_processor(runtime, config)
+        logger.debug("init_multimodal_processor completed")
+    elif config.multimodal_encode_worker:
+        await init_multimodal_encode_worker(runtime, config)
+        logger.debug("init_multimodal_encode_worker completed")
+    elif config.multimodal_worker:
+        await init_multimodal_worker(runtime, config)
+        logger.debug("init_multimodal_worker completed")
+    elif config.is_prefill_worker:
         await init_prefill(runtime, config)
         logger.debug("init_prefill completed")
     else:
@@ -430,6 +445,147 @@ def get_engine_cache_info(engine: AsyncLLM):
         raise
 
 
+async def init_multimodal_processor(runtime: DistributedRuntime, config: Config):
+    """Initialize multimodal processor component"""
+    component = runtime.namespace(config.namespace).component(config.component)
+    await component.create_service()
+
+    generate_endpoint = component.endpoint(config.endpoint)
+
+    # Get encode worker client
+    encode_worker_client = (
+        await runtime.namespace(config.namespace)
+        .component("encoder")
+        .endpoint("generate")
+        .client()
+    )
+
+    # Get prompt template from args (must be passed via environment or command line)
+    mm_prompt_template = config.mm_prompt_template
+
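+    # ProcessorHandler combines the engine args, the prompt template, and the
+    # encode-worker client created above, so preprocessed requests can be
+    # forwarded to the "encoder" component.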
+    handler = ProcessorHandler(
+        config.engine_args,
+        encode_worker_client,
+        mm_prompt_template,
+    )
+
+    logger.info("Waiting for Encoder Worker Instances ...")
+    await encode_worker_client.wait_for_instances()
+
+    # Register the endpoint as the entrypoint to a model
+    await register_llm(
+        ModelInput.Text,  # A custom processor is used, so this type bypasses the SDK processor
+        ModelType.Chat,
+        generate_endpoint,
+        config.model,
+        config.served_model_name,
+        kv_cache_block_size=config.engine_args.block_size,
+    )
+
+    logger.info("Starting to serve the processor endpoint...")
+
+    try:
+        await asyncio.gather(
+            generate_endpoint.serve_endpoint(
+                handler.generate, metrics_labels=[("model", config.model)]
+            ),
+        )
+    except Exception as e:
+        logger.error(f"Failed to serve endpoints: {e}")
+        raise
+    finally:
+        handler.cleanup()
+
+
+async def init_multimodal_encode_worker(runtime: DistributedRuntime, config: Config):
+    """Initialize multimodal encode worker component"""
+    component = runtime.namespace(config.namespace).component(config.component)
+    await component.create_service()
+
+    generate_endpoint = component.endpoint(config.endpoint)
+
+    # Get PD worker client
+    # In multimodal mode, the PD worker always registers as "backend"
+    # (even in disaggregated mode with prefill/decode split, we still connect to "backend")
+    pd_worker_client = (
+        await runtime.namespace(config.namespace)
+        .component("backend")
+        .endpoint("generate")
+        .client()
+    )
+
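+    # EncodeWorkerHandler performs the multimodal encode step and hands its
+    # output to the PD worker through the "backend" client above.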
+    handler = EncodeWorkerHandler(
+        config.engine_args,
+        pd_worker_client,
+    )
+    await handler.async_init(runtime)
+    logger.info("Waiting for PD Worker Instances ...")
+    await pd_worker_client.wait_for_instances()
+    logger.info("Starting to serve the encode worker endpoint...")
+
+    try:
+        await asyncio.gather(
+            generate_endpoint.serve_endpoint(
+                handler.generate, metrics_labels=[("model", config.model)]
+            ),
+        )
+    except Exception as e:
+        logger.error(f"Failed to serve endpoints: {e}")
+        raise
+    finally:
+        handler.cleanup()
+
+
+async def init_multimodal_worker(runtime: DistributedRuntime, config: Config):
+    """Initialize multimodal worker component for aggregated or disaggregated mode"""
+
+    component = runtime.namespace(config.namespace).component(config.component)
+    await component.create_service()
+
+    generate_endpoint = component.endpoint(config.endpoint)
+    clear_endpoint = component.endpoint("clear_kv_blocks")
+
+    engine_client, vllm_config, default_sampling_params = setup_vllm_engine(config)
+
+    # TODO: Support disaggregated mode separately
+    client = (
+        await runtime.namespace(config.namespace)
+        .component("backend")
+        .endpoint("generate")
+        .client()
+    )
+
+    handler = MultimodalPDWorkerHandler(
+        runtime, component, engine_client, config, client
+    )
+
+    await handler.async_init(runtime)
+
+    # Set up KV event publisher for prefix caching if enabled
+    kv_publisher = setup_kv_event_publisher(
+        config, component, generate_endpoint, vllm_config
+    )
+    if kv_publisher:
+        handler.kv_publisher = kv_publisher
+
+    metrics_labels = [("model", config.model)]
+
+    try:
+        await asyncio.gather(
+            generate_endpoint.serve_endpoint(
+                handler.generate, metrics_labels=metrics_labels
+            ),
+            clear_endpoint.serve_endpoint(
+                handler.clear_kv_blocks, metrics_labels=metrics_labels
+            ),
+        )
+    except Exception as e:
+        logger.error(f"Failed to serve endpoints: {e}")
+        raise
+    finally:
+        handler.cleanup()
+
+
 def main():
     uvloop.run(worker())
 
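Taken together, the clients created above define the multimodal request path: the processor holds a client to the "encoder" component, and the encode worker holds a client to the "backend" component, so a request flows processor → encode worker → PD worker. Below is a minimal sketch of the namespace → component → endpoint → client chain plus the readiness gate that each init function uses; it reuses only calls that appear in this diff, and the `downstream` name is illustrative:

    # Resolve a client for the PD worker ("backend" is the component name the
    # comments above call out) and block until at least one instance registers.
    downstream = (
        await runtime.namespace(config.namespace)
        .component("backend")
        .endpoint("generate")
        .client()
    )
    await downstream.wait_for_instances()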