3 changes: 1 addition & 2 deletions README.md
@@ -464,12 +464,11 @@ For multimodal models deployed directly with `NeMoMultimodalDeployable`, use the

 ```python
 from nemo_deploy.multimodal import NemoQueryMultimodalPytorch
-from PIL import Image
 
 nq = NemoQueryMultimodalPytorch(url="localhost:8000", model_name="qwen")
 output = nq.query_multimodal(
     prompts=["What is in this image?"],
-    images=[Image.open("/path/to/image.jpg")],
+    images=["https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"],
     max_length=100,
     top_k=1,
     top_p=0.0,
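Editor's note: for local files, the same call accepts a base64 data URI; a minimal sketch continuing the snippet above (the file path is a placeholder, and the `data:image;base64,` prefix follows the convention documented in `query_multimodal.py` below):

```python
import base64

# Hypothetical local file; the data URI prefix marks the payload as inline image data.
with open("/path/to/image.jpg", "rb") as f:
    image_data = "data:image;base64," + base64.b64encode(f.read()).decode("utf-8")

output = nq.query_multimodal(
    prompts=["What is in this image?"],
    images=[image_data],
    max_length=100,
    top_k=1,
    top_p=0.0,
)
```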
22 changes: 18 additions & 4 deletions nemo_deploy/multimodal/nemo_multimodal_deployable.py
@@ -157,16 +157,24 @@ def apply_chat_template(self, messages, add_generation_prompt=True):
         )
         return text
 
-    def base64_to_image(self, image_base64):
-        """Convert base64-encoded image to PIL Image."""
+    def process_image_input(self, image_source):
+        """Process image input from base64-encoded string or HTTP URL.
+
+        Args:
+            image_source (str): Image source - either base64-encoded image string with data URI prefix
+                (e.g., "data:image;base64,...") or HTTP/HTTPS URL (e.g., "http://example.com/image.jpg")
+
+        Returns:
+            Processed image content suitable for model inference.
+        """
         if isinstance(self.inference_wrapped_model, QwenVLInferenceWrapper):
             from qwen_vl_utils import process_vision_info
 
             messages = [
                 {
                     "role": "user",
                     "content": [
-                        {"type": "image", "image": f"data:image;base64,{image_base64}"},
+                        {"type": "image", "image": image_source},
                     ],
                 }
             ]
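Editor's note: a single string argument covers both cases because, for Qwen models, `qwen_vl_utils.process_vision_info` resolves whatever appears in the `image` field, whether an HTTP(S) URL or a `data:image;base64,...` string. A minimal sketch of that behavior, assuming `qwen_vl_utils` is installed (the URL is a placeholder):

```python
from qwen_vl_utils import process_vision_info

image_source = "https://example.com/image.jpg"  # or "data:image;base64,<...>"
messages = [{"role": "user", "content": [{"type": "image", "image": image_source}]}]

# Returns loaded PIL images (and video inputs, unused here) ready for the processor.
image_inputs, video_inputs = process_vision_info(messages)
```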
@@ -259,14 +267,20 @@ def _infer_fn(
         Returns:
             dict: sentences.
         """
+        # Handle temperature=0.0 for greedy decoding
+        if temperature == 0.0:
+            LOGGER.warning("temperature=0.0 detected. Setting top_k=1 for greedy sampling.")
+            top_k = 1
+            top_p = 0.0
+
         inference_params = CommonInferenceParams(
             temperature=float(temperature),
             top_k=int(top_k),
             top_p=float(top_p),
             num_tokens_to_generate=num_tokens_to_generate,
         )
 
-        images = [self.base64_to_image(img_b64) for img_b64 in images]
+        images = [self.process_image_input(image_source) for image_source in images]
 
         results = self.generate(
             prompts,
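Editor's note: the `temperature == 0.0` branch above replaces the `model_validator` removed from the FastAPI layer (see the `fastapi_interface_to_pytriton_multimodal.py` diff below), so greedy decoding is now enforced at inference time. The effective configuration in that case, as a sketch (`num_tokens_to_generate=100` is a placeholder value):

```python
# Effective sampling configuration when temperature == 0.0, regardless of
# the top_k/top_p values the caller supplied:
greedy_params = CommonInferenceParams(
    temperature=0.0,
    top_k=1,
    top_p=0.0,
    num_tokens_to_generate=100,
)
```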
14 changes: 11 additions & 3 deletions nemo_deploy/multimodal/query_multimodal.py
@@ -195,9 +195,16 @@ class NemoQueryMultimodalPytorch:

 nq = NemoQueryMultimodalPytorch(url="localhost", model_name="qwen")
 
-# Encode image to base64
+# Option 1: Use HTTP URL directly
+output = nq.query_multimodal(
+    prompts=["Describe this image"],
+    images=["http://example.com/image.jpg"],
+    max_length=100,
+)
+
+# Option 2: Encode image to base64 with data URI prefix
 with open("image.jpg", "rb") as f:
-    image_base64 = base64.b64encode(f.read()).decode('utf-8')
+    image_base64 = "data:image;base64," + base64.b64encode(f.read()).decode('utf-8')
 
 output = nq.query_multimodal(
     prompts=["Describe this image"],
@@ -231,7 +238,8 @@ def query_multimodal(

         Args:
             prompts (List[str]): List of input text prompts.
-            images (List[str]): List of base64-encoded image strings.
+            images (List[str]): List of image strings - either base64-encoded with data URI prefix
+                (e.g., "data:image;base64,...") or HTTP/HTTPS URLs (e.g., "http://example.com/image.jpg").
             max_length (Optional[int]): Maximum number of tokens to generate.
             max_batch_size (Optional[int]): Maximum batch size for inference.
             top_k (Optional[int]): Limits to the top K tokens to consider at each step.
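Editor's note: because each element of `images` is processed independently (see the list comprehension in `nemo_multimodal_deployable.py` above), a single batched call can plausibly mix both forms; a minimal sketch (the file name and URL are placeholders):

```python
import base64

from nemo_deploy.multimodal import NemoQueryMultimodalPytorch

# Hypothetical local file, encoded with the documented data URI prefix.
with open("local_photo.jpg", "rb") as f:
    data_uri = "data:image;base64," + base64.b64encode(f.read()).decode("utf-8")

nq = NemoQueryMultimodalPytorch(url="localhost:8000", model_name="qwen")
output = nq.query_multimodal(
    prompts=["Describe this image", "What is in this image?"],
    images=["http://example.com/image.jpg", data_uri],  # one URL, one data URI
    max_length=100,
)
```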
35 changes: 24 additions & 11 deletions nemo_deploy/service/fastapi_interface_to_pytriton_multimodal.py
@@ -19,7 +19,7 @@
 import numpy as np
 import requests
 from fastapi import FastAPI, HTTPException
-from pydantic import BaseModel, model_validator
+from pydantic import BaseModel
 from pydantic_settings import BaseSettings
 
 from nemo_deploy.multimodal.query_multimodal import NemoQueryMultimodalPytorch
@@ -82,18 +82,10 @@ class BaseMultimodalRequest(BaseModel):
     max_tokens: int = 50
     temperature: float = 1.0
     top_p: float = 0.0
-    top_k: int = 1
+    top_k: int = 0
     random_seed: Optional[int] = None
     max_batch_size: int = 4
-
-    @model_validator(mode="after")
-    def set_greedy_params(self):
-        """Validate parameters for greedy decoding."""
-        if self.temperature == 0 and self.top_p == 0:
-            logging.warning("Both temperature and top_p are 0. Setting top_k to 1 to ensure greedy sampling.")
-            self.top_k = 1
-        return self
 
 
 class MultimodalCompletionRequest(BaseMultimodalRequest):
     """Represents a request for multimodal text completion.
@@ -290,12 +282,33 @@ def dict_to_str(messages):

 @app.post("/v1/chat/completions/")
 async def chat_completions_v1(request: MultimodalChatCompletionRequest):
-    """Defines the multimodal chat completions endpoint and queries the model deployed on PyTriton server."""
+    """Defines the multimodal chat completions endpoint and queries the model deployed on PyTriton server.
+
+    Supports two image content formats (normalized internally to format 1):
+    1. {"type": "image", "image": "url_or_base64"}
+    2. {"type": "image_url", "image_url": {"url": "url_or_base64"}} (OpenAI-style, converted to format 1)
+    """
     url = f"http://{triton_settings.triton_service_ip}:{triton_settings.triton_service_port}"
 
     prompts = request.messages
     if not isinstance(request.messages, list):
         prompts = [request.messages]
+
+    # Normalize image_url format to image format for consistent processing
+    for message in prompts:
+        for content in message["content"]:
+            if content["type"] == "image_url":
+                # Convert OpenAI-style image_url to standard image format
+                if isinstance(content.get("image_url"), dict):
+                    image_data = content["image_url"]["url"]
+                else:
+                    image_data = content["image_url"]
+                # Transform to image format
+                content["type"] = "image"
+                content["image"] = image_data
+                # Remove image_url field
+                content.pop("image_url", None)
+
     # Serialize the dictionary to a JSON string representation to be able to convert to numpy array
     # (str_list2numpy) and back to list (str_ndarray2list) as required by PyTriton. Using the dictionaries directly
     # with these methods is not possible as they expect string type.
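Editor's note: for reference, a request in the OpenAI-style format 2 that the normalization above rewrites into format 1; a minimal sketch (host, port, the `model` field, and the `text` content entry are assumptions):

```python
import requests

payload = {
    "model": "qwen",  # assumed field; adjust to the deployed model name
    "messages": [
        {
            "role": "user",
            "content": [
                # OpenAI-style entry; the endpoint rewrites it to
                # {"type": "image", "image": "<url>"} before inference.
                {"type": "image_url", "image_url": {"url": "http://example.com/image.jpg"}},
                {"type": "text", "text": "Describe this image"},
            ],
        }
    ],
    "max_tokens": 50,
}

resp = requests.post("http://localhost:8000/v1/chat/completions/", json=payload)
print(resp.json())
```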
@@ -64,19 +64,16 @@ def load_image_from_path(image_path: str) -> str:
         image_path: Path to local image file or URL
 
     Returns:
-        Base64-encoded image string
+        Image string - HTTP URL directly or base64-encoded string for local files
     """
     if image_path.startswith(("http://", "https://")):
-        LOGGER.info(f"Loading image from URL: {image_path}")
-        response = requests.get(image_path, timeout=30)
-        response.raise_for_status()
-        image_content = response.content
+        LOGGER.info(f"Using image URL directly: {image_path}")
+        return image_path
     else:
-        LOGGER.info(f"Loading image from local path: {image_path}")
+        LOGGER.info(f"Loading and encoding image from local path: {image_path}")
         with open(image_path, "rb") as f:
             image_content = f.read()
-
-    return base64.b64encode(image_content).decode("utf-8")
+        return "data:image;base64," + base64.b64encode(image_content).decode("utf-8")
 
 
 def test_completions_endpoint(base_url: str, model_id: str, prompt: str = None, image_source: str = None) -> None:
@@ -114,8 +111,8 @@ def test_completions_endpoint(base_url: str, model_id: str, prompt: str = None, image_source: str = None) -> None:
     payload["prompt"] = text
 
     try:
-        image_base64 = load_image_from_path(image_source)
-        payload["image"] = image_base64
+        image_data = load_image_from_path(image_source)
+        payload["image"] = image_data
     except Exception as e:
         LOGGER.error(f"Failed to load image: {e}")
         return
@@ -130,7 +127,12 @@


 def test_chat_completions_endpoint(base_url: str, model_id: str, prompt: str = None, image_source: str = None) -> None:
-    """Test the chat completions endpoint for multimodal models."""
+    """Test the chat completions endpoint for multimodal models.
+
+    Supports two image content formats:
+    1. {"type": "image", "image": "url_or_base64"}
+    2. {"type": "image_url", "image_url": {"url": "url_or_base64"}} (OpenAI-style)
+    """
     url = f"{base_url}/v1/chat/completions/"
 
     # Use provided prompt or default
@@ -141,8 +143,10 @@ def test_chat_completions_endpoint(base_url: str, model_id: str, prompt: str = None, image_source: str = None) -> None:

     content = []
     try:
-        image_base64 = load_image_from_path(image_source)
-        content.append({"type": "image", "image": image_base64})
+        image_data = load_image_from_path(image_source)
+        # Using format 1: {"type": "image", "image": "url_or_base64"}
+        # Alternative format 2: {"type": "image_url", "image_url": {"url": "url_or_base64"}}
+        content.append({"type": "image", "image": image_data})
     except Exception as e:
         LOGGER.error(f"Failed to load image: {e}")
         return
@@ -167,19 +171,6 @@ def test_chat_completions_endpoint(base_url: str, model_id: str, prompt: str = None, image_source: str = None) -> None:
         LOGGER.error(f"Error: {response.text}")
 
 
-def test_models_endpoint(base_url: str) -> None:
-    """Test the models endpoint."""
-    url = f"{base_url}/v1/models"
-
-    LOGGER.info(f"Testing models endpoint at {url}")
-    response = requests.get(url)
-    LOGGER.info(f"Response status code: {response.status_code}")
-    if response.status_code == 200:
-        LOGGER.info(f"Response: {json.dumps(response.json(), indent=2)}")
-    else:
-        LOGGER.error(f"Error: {response.text}")
-
-
 def test_health_endpoint(base_url: str) -> None:
     """Test the health endpoint."""
     url = f"{base_url}/v1/health"
@@ -218,7 +209,6 @@ def main():
     test_completions_endpoint(base_url, args.model_id, args.prompt, args.image)
     test_chat_completions_endpoint(base_url, args.model_id, args.prompt, args.image)
     test_health_endpoint(base_url)
-    test_models_endpoint(base_url)
 
 
 if __name__ == "__main__":
18 changes: 7 additions & 11 deletions scripts/deploy/multimodal/query_inframework.py
@@ -17,7 +17,6 @@
 import logging
 import time
 
-import requests
 from transformers import AutoProcessor
 
 from nemo_deploy.multimodal.query_multimodal import NemoQueryMultimodalPytorch
@@ -32,19 +31,16 @@ def load_image_from_path(image_path: str) -> str:
         image_path: Path to local image file or URL
 
     Returns:
-        Base64-encoded image string
+        Image string - HTTP URL directly or base64-encoded string for local files
     """
     if image_path.startswith(("http://", "https://")):
-        LOGGER.info(f"Loading image from URL: {image_path}")
-        response = requests.get(image_path, timeout=30)
-        response.raise_for_status()
-        image_content = response.content
+        LOGGER.info(f"Using image URL directly: {image_path}")
+        return image_path
     else:
-        LOGGER.info(f"Loading image from local path: {image_path}")
+        LOGGER.info(f"Loading and encoding image from local path: {image_path}")
         with open(image_path, "rb") as f:
             image_content = f.read()
-
-    return base64.b64encode(image_content).decode("utf-8")
+        return "data:image;base64," + base64.b64encode(image_content).decode("utf-8")
 
 
 def get_args():
@@ -121,7 +117,7 @@ def query():
         with open(args.prompt_file, "r") as f:
             args.prompt = f.read()
 
-    image_base64 = load_image_from_path(args.image)
+    image_source = load_image_from_path(args.image)
 
     if "Qwen" in args.processor_name:
         processor = AutoProcessor.from_pretrained(args.processor_name)
@@ -146,7 +142,7 @@ def query():
     nemo_query = NemoQueryMultimodalPytorch(args.url, args.model_name)
     outputs = nemo_query.query_multimodal(
         prompts=[args.prompt],
-        images=[image_base64],
+        images=[image_source],
         max_length=args.max_output_len,
         max_batch_size=args.max_batch_size,
         top_k=args.top_k,
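Editor's note: in the Qwen branch above, the prompt is presumably formatted with the processor's chat template before querying; a minimal sketch of that step, assuming a Qwen2-VL-style processor (the checkpoint name and message layout are illustrative, not this script's exact code):

```python
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")  # illustrative checkpoint

image_source = "https://example.com/image.jpg"  # URL or data URI from load_image_from_path
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image_source},
            {"type": "text", "text": "Describe this image"},
        ],
    }
]

# tokenize=False returns the templated prompt string to pass as prompts=[...]
prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
```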