3 changes: 1 addition & 2 deletions README.md
@@ -464,12 +464,11 @@ For multimodal models deployed directly with `NeMoMultimodalDeployable`, use the

 ```python
 from nemo_deploy.multimodal import NemoQueryMultimodalPytorch
-from PIL import Image
 
 nq = NemoQueryMultimodalPytorch(url="localhost:8000", model_name="qwen")
 output = nq.query_multimodal(
     prompts=["What is in this image?"],
-    images=[Image.open("/path/to/image.jpg")],
+    images=["https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"],
     max_length=100,
     top_k=1,
     top_p=0.0,
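Editor's note: for local files, the same call accepts a base64 data URI; a minimal sketch continuing the snippet above (the file path is a placeholder, and the `data:image;base64,` prefix follows the convention documented in `query_multimodal.py` below):

```python
import base64

# Hypothetical local file; the data URI prefix marks the payload as inline image data.
with open("/path/to/image.jpg", "rb") as f:
    image_data = "data:image;base64," + base64.b64encode(f.read()).decode("utf-8")

output = nq.query_multimodal(
    prompts=["What is in this image?"],
    images=[image_data],
    max_length=100,
    top_k=1,
    top_p=0.0,
)
```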
22 changes: 18 additions & 4 deletions nemo_deploy/multimodal/nemo_multimodal_deployable.py
@@ -157,16 +157,24 @@ def apply_chat_template(self, messages, add_generation_prompt=True):
         )
         return text
 
-    def base64_to_image(self, image_base64):
-        """Convert base64-encoded image to PIL Image."""
+    def process_image_input(self, image_source):
+        """Process image input from base64-encoded string or HTTP URL.
+
+        Args:
+            image_source (str): Image source - either base64-encoded image string with data URI prefix
+                (e.g., "data:image;base64,...") or HTTP/HTTPS URL (e.g., "http://example.com/image.jpg")
+
+        Returns:
+            Processed image content suitable for model inference.
+        """
         if isinstance(self.inference_wrapped_model, QwenVLInferenceWrapper):
             from qwen_vl_utils import process_vision_info
 
             messages = [
                 {
                     "role": "user",
                     "content": [
-                        {"type": "image", "image": f"data:image;base64,{image_base64}"},
+                        {"type": "image", "image": image_source},
                     ],
                 }
             ]
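Editor's note: a single string argument covers both cases because, for Qwen models, `qwen_vl_utils.process_vision_info` resolves whatever appears in the `image` field, whether an HTTP(S) URL or a `data:image;base64,...` string. A minimal sketch of that behavior, assuming `qwen_vl_utils` is installed (the URL is a placeholder):

```python
from qwen_vl_utils import process_vision_info

image_source = "https://example.com/image.jpg"  # or "data:image;base64,<...>"
messages = [{"role": "user", "content": [{"type": "image", "image": image_source}]}]

# Returns loaded PIL images (and video inputs, unused here) ready for the processor.
image_inputs, video_inputs = process_vision_info(messages)
```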
@@ -259,14 +267,20 @@ def _infer_fn(
         Returns:
             dict: sentences.
         """
+        # Handle temperature=0.0 for greedy decoding
+        if temperature == 0.0:
+            LOGGER.warning("temperature=0.0 detected. Setting top_k=1 for greedy sampling.")
+            top_k = 1
+            top_p = 0.0
+
         inference_params = CommonInferenceParams(
             temperature=float(temperature),
             top_k=int(top_k),
             top_p=float(top_p),
             num_tokens_to_generate=num_tokens_to_generate,
         )
 
-        images = [self.base64_to_image(img_b64) for img_b64 in images]
+        images = [self.process_image_input(image_source) for image_source in images]
 
         results = self.generate(
             prompts,
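Editor's note: the `temperature == 0.0` branch above replaces the `model_validator` removed from the FastAPI layer (see the `fastapi_interface_to_pytriton_multimodal.py` diff below), so greedy decoding is now enforced at inference time. The effective configuration in that case, as a sketch (`num_tokens_to_generate=100` is a placeholder value):

```python
# Effective sampling configuration when temperature == 0.0, regardless of
# the top_k/top_p values the caller supplied:
greedy_params = CommonInferenceParams(
    temperature=0.0,
    top_k=1,
    top_p=0.0,
    num_tokens_to_generate=100,
)
```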
14 changes: 11 additions & 3 deletions nemo_deploy/multimodal/query_multimodal.py
@@ -195,9 +195,16 @@ class NemoQueryMultimodalPytorch:

 nq = NemoQueryMultimodalPytorch(url="localhost", model_name="qwen")
 
-# Encode image to base64
+# Option 1: Use HTTP URL directly
+output = nq.query_multimodal(
+    prompts=["Describe this image"],
+    images=["http://example.com/image.jpg"],
+    max_length=100,
+)
+
+# Option 2: Encode image to base64 with data URI prefix
 with open("image.jpg", "rb") as f:
-    image_base64 = base64.b64encode(f.read()).decode('utf-8')
+    image_base64 = "data:image;base64," + base64.b64encode(f.read()).decode('utf-8')
 
 output = nq.query_multimodal(
     prompts=["Describe this image"],
@@ -231,7 +238,8 @@ def query_multimodal(

         Args:
             prompts (List[str]): List of input text prompts.
-            images (List[str]): List of base64-encoded image strings.
+            images (List[str]): List of image strings - either base64-encoded with data URI prefix
+                (e.g., "data:image;base64,...") or HTTP/HTTPS URLs (e.g., "http://example.com/image.jpg").
             max_length (Optional[int]): Maximum number of tokens to generate.
             max_batch_size (Optional[int]): Maximum batch size for inference.
             top_k (Optional[int]): Limits to the top K tokens to consider at each step.
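Editor's note: because each element of `images` is processed independently (see the list comprehension in `nemo_multimodal_deployable.py` above), a single batched call can plausibly mix both forms; a minimal sketch (the file name and URL are placeholders):

```python
import base64

from nemo_deploy.multimodal import NemoQueryMultimodalPytorch

# Hypothetical local file, encoded with the documented data URI prefix.
with open("local_photo.jpg", "rb") as f:
    data_uri = "data:image;base64," + base64.b64encode(f.read()).decode("utf-8")

nq = NemoQueryMultimodalPytorch(url="localhost:8000", model_name="qwen")
output = nq.query_multimodal(
    prompts=["Describe this image", "What is in this image?"],
    images=["http://example.com/image.jpg", data_uri],  # one URL, one data URI
    max_length=100,
)
```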
35 changes: 24 additions & 11 deletions nemo_deploy/service/fastapi_interface_to_pytriton_multimodal.py
@@ -19,7 +19,7 @@
 import numpy as np
 import requests
 from fastapi import FastAPI, HTTPException
-from pydantic import BaseModel, model_validator
+from pydantic import BaseModel
 from pydantic_settings import BaseSettings
 
 from nemo_deploy.multimodal.query_multimodal import NemoQueryMultimodalPytorch
@@ -82,18 +82,10 @@ class BaseMultimodalRequest(BaseModel):
     max_tokens: int = 50
     temperature: float = 1.0
     top_p: float = 0.0
-    top_k: int = 1
+    top_k: int = 0
     random_seed: Optional[int] = None
     max_batch_size: int = 4
-
-    @model_validator(mode="after")
-    def set_greedy_params(self):
-        """Validate parameters for greedy decoding."""
-        if self.temperature == 0 and self.top_p == 0:
-            logging.warning("Both temperature and top_p are 0. Setting top_k to 1 to ensure greedy sampling.")
-            self.top_k = 1
-        return self
 
 
 class MultimodalCompletionRequest(BaseMultimodalRequest):
     """Represents a request for multimodal text completion.
@@ -290,12 +282,33 @@ def dict_to_str(messages):

 @app.post("/v1/chat/completions/")
 async def chat_completions_v1(request: MultimodalChatCompletionRequest):
-    """Defines the multimodal chat completions endpoint and queries the model deployed on PyTriton server."""
+    """Defines the multimodal chat completions endpoint and queries the model deployed on PyTriton server.
+
+    Supports two image content formats (normalized internally to format 1):
+    1. {"type": "image", "image": "url_or_base64"}
+    2. {"type": "image_url", "image_url": {"url": "url_or_base64"}} (OpenAI-style, converted to format 1)
+    """
     url = f"http://{triton_settings.triton_service_ip}:{triton_settings.triton_service_port}"
 
     prompts = request.messages
     if not isinstance(request.messages, list):
         prompts = [request.messages]
+
+    # Normalize image_url format to image format for consistent processing
+    for message in prompts:
+        for content in message["content"]:
+            if content["type"] == "image_url":
+                # Convert OpenAI-style image_url to standard image format
+                if isinstance(content.get("image_url"), dict):
+                    image_data = content["image_url"]["url"]
+                else:
+                    image_data = content["image_url"]
+                # Transform to image format
+                content["type"] = "image"
+                content["image"] = image_data
+                # Remove image_url field
+                content.pop("image_url", None)
+
     # Serialize the dictionary to a JSON string representation to be able to convert to numpy array
     # (str_list2numpy) and back to list (str_ndarray2list) as required by PyTriton. Using the dictionaries directly
     # with these methods is not possible as they expect string type.
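Editor's note: for reference, a request in the OpenAI-style format 2 that the normalization above rewrites into format 1; a minimal sketch (host, port, the `model` field, and the `text` content entry are assumptions):

```python
import requests

payload = {
    "model": "qwen",  # assumed field; adjust to the deployed model name
    "messages": [
        {
            "role": "user",
            "content": [
                # OpenAI-style entry; the endpoint rewrites it to
                # {"type": "image", "image": "<url>"} before inference.
                {"type": "image_url", "image_url": {"url": "http://example.com/image.jpg"}},
                {"type": "text", "text": "Describe this image"},
            ],
        }
    ],
    "max_tokens": 50,
}

resp = requests.post("http://localhost:8000/v1/chat/completions/", json=payload)
print(resp.json())
```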
@@ -64,19 +64,16 @@ def load_image_from_path(image_path: str) -> str:
         image_path: Path to local image file or URL
 
     Returns:
-        Base64-encoded image string
+        Image string - HTTP URL directly or base64-encoded string for local files
     """
     if image_path.startswith(("http://", "https://")):
-        LOGGER.info(f"Loading image from URL: {image_path}")
-        response = requests.get(image_path, timeout=30)
-        response.raise_for_status()
-        image_content = response.content
+        LOGGER.info(f"Using image URL directly: {image_path}")
+        return image_path
     else:
-        LOGGER.info(f"Loading image from local path: {image_path}")
+        LOGGER.info(f"Loading and encoding image from local path: {image_path}")
         with open(image_path, "rb") as f:
             image_content = f.read()
-
-    return base64.b64encode(image_content).decode("utf-8")
+        return "data:image;base64," + base64.b64encode(image_content).decode("utf-8")
 
 
 def test_completions_endpoint(base_url: str, model_id: str, prompt: str = None, image_source: str = None) -> None:
@@ -114,8 +111,8 @@ def test_completions_endpoint(base_url: str, model_id: str, prompt: str = None, image_source: str = None) -> None:
     payload["prompt"] = text
 
     try:
-        image_base64 = load_image_from_path(image_source)
-        payload["image"] = image_base64
+        image_data = load_image_from_path(image_source)
+        payload["image"] = image_data
     except Exception as e:
         LOGGER.error(f"Failed to load image: {e}")
         return
@@ -130,7 +127,12 @@


 def test_chat_completions_endpoint(base_url: str, model_id: str, prompt: str = None, image_source: str = None) -> None:
-    """Test the chat completions endpoint for multimodal models."""
+    """Test the chat completions endpoint for multimodal models.
+
+    Supports two image content formats:
+    1. {"type": "image", "image": "url_or_base64"}
+    2. {"type": "image_url", "image_url": {"url": "url_or_base64"}} (OpenAI-style)
+    """
     url = f"{base_url}/v1/chat/completions/"
 
     # Use provided prompt or default
@@ -141,8 +143,10 @@ def test_chat_completions_endpoint(base_url: str, model_id: str, prompt: str = None, image_source: str = None) -> None:

     content = []
     try:
-        image_base64 = load_image_from_path(image_source)
-        content.append({"type": "image", "image": image_base64})
+        image_data = load_image_from_path(image_source)
+        # Using format 1: {"type": "image", "image": "url_or_base64"}
+        # Alternative format 2: {"type": "image_url", "image_url": {"url": "url_or_base64"}}
+        content.append({"type": "image", "image": image_data})
     except Exception as e:
         LOGGER.error(f"Failed to load image: {e}")
         return
@@ -167,19 +171,6 @@ def test_chat_completions_endpoint(base_url: str, model_id: str, prompt: str = None, image_source: str = None) -> None:
         LOGGER.error(f"Error: {response.text}")
 
 
-def test_models_endpoint(base_url: str) -> None:
-    """Test the models endpoint."""
-    url = f"{base_url}/v1/models"
-
-    LOGGER.info(f"Testing models endpoint at {url}")
-    response = requests.get(url)
-    LOGGER.info(f"Response status code: {response.status_code}")
-    if response.status_code == 200:
-        LOGGER.info(f"Response: {json.dumps(response.json(), indent=2)}")
-    else:
-        LOGGER.error(f"Error: {response.text}")
-
-
 def test_health_endpoint(base_url: str) -> None:
     """Test the health endpoint."""
     url = f"{base_url}/v1/health"
@@ -218,7 +209,6 @@ def main():
     test_completions_endpoint(base_url, args.model_id, args.prompt, args.image)
     test_chat_completions_endpoint(base_url, args.model_id, args.prompt, args.image)
     test_health_endpoint(base_url)
-    test_models_endpoint(base_url)
 
 
 if __name__ == "__main__":
18 changes: 7 additions & 11 deletions scripts/deploy/multimodal/query_inframework.py
@@ -17,7 +17,6 @@
 import logging
 import time
 
-import requests
 from transformers import AutoProcessor
 
 from nemo_deploy.multimodal.query_multimodal import NemoQueryMultimodalPytorch
@@ -32,19 +31,16 @@ def load_image_from_path(image_path: str) -> str:
         image_path: Path to local image file or URL
 
     Returns:
-        Base64-encoded image string
+        Image string - HTTP URL directly or base64-encoded string for local files
     """
     if image_path.startswith(("http://", "https://")):
-        LOGGER.info(f"Loading image from URL: {image_path}")
-        response = requests.get(image_path, timeout=30)
-        response.raise_for_status()
-        image_content = response.content
+        LOGGER.info(f"Using image URL directly: {image_path}")
+        return image_path
     else:
-        LOGGER.info(f"Loading image from local path: {image_path}")
+        LOGGER.info(f"Loading and encoding image from local path: {image_path}")
         with open(image_path, "rb") as f:
             image_content = f.read()
-
-    return base64.b64encode(image_content).decode("utf-8")
+        return "data:image;base64," + base64.b64encode(image_content).decode("utf-8")
 
 
 def get_args():
@@ -121,7 +117,7 @@ def query():
         with open(args.prompt_file, "r") as f:
             args.prompt = f.read()
 
-    image_base64 = load_image_from_path(args.image)
+    image_source = load_image_from_path(args.image)
 
     if "Qwen" in args.processor_name:
         processor = AutoProcessor.from_pretrained(args.processor_name)
@@ -146,7 +142,7 @@ def query():
     nemo_query = NemoQueryMultimodalPytorch(args.url, args.model_name)
     outputs = nemo_query.query_multimodal(
         prompts=[args.prompt],
-        images=[image_base64],
+        images=[image_source],
         max_length=args.max_output_len,
         max_batch_size=args.max_batch_size,
         top_k=args.top_k,
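Editor's note: in the Qwen branch above, the prompt is presumably formatted with the processor's chat template before querying; a minimal sketch of that step, assuming a Qwen2-VL-style processor (the checkpoint name and message layout are illustrative, not this script's exact code):

```python
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")  # illustrative checkpoint

image_source = "https://example.com/image.jpg"  # URL or data URI from load_image_from_path
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image_source},
            {"type": "text", "text": "Describe this image"},
        ],
    }
]

# tokenize=False returns the templated prompt string to pass as prompts=[...]
prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
```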