Multi-image inference using LLava #1707

@DavidAbrahamyan

Description

I am trying to run multi-image inference with llava-v1.6-vicuna-7b. Here is my code:

from PIL import Image
import requests  # needed for the image downloads below
import habana_frameworks.torch as ht
import habana_frameworks.torch.core as htcore
import torch
from transformers import AutoProcessor, AutoConfig, pipeline
from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
import time

adapt_transformers_to_gaudi()

print("----------------------------=====================-------------------------------")
print("Trying LLava-Next(1.6)-Vicuna-7B")
print("Setting the device to hpu")
device = torch.device("hpu")

print("Loading the model")
args_model_name_or_path = "/workspace/models/model_llava_v1_6_vicuna_7b"
model_type = AutoConfig.from_pretrained(args_model_name_or_path).model_type

print("Model type: ", model_type)

print("Loading the processor")
args_processor = AutoProcessor.from_pretrained(args_model_name_or_path)

model_dtype = torch.bfloat16

url = "https://www.ilankelman.org/stopsigns/australia.jpg"
image_stop = Image.open(requests.get(url, stream=True).raw)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image_cats = Image.open(requests.get(url, stream=True).raw)

url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
image_snowman = Image.open(requests.get(url, stream=True).raw)

# Prepare a batch of two prompts, where the first one is a multi-turn conversation and the second is not
conversation_1 = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "What is shown in this image?"},
            ],
    },
    {
        "role": "assistant",
        "content": [
            {"type": "text", "text": "There is a red stop sign in the image."},
            ],
    },
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "What about this image? How many cats do you see?"},
            ],
    },
]

conversation_2 = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "What is shown in this image?"},
            ],
    },
]

prompt_1 = args_processor.apply_chat_template(conversation_1, add_generation_prompt=True)
prompt_2 = args_processor.apply_chat_template(conversation_2, add_generation_prompt=True)
prompts = [prompt_1, prompt_2]

print("Creating the pipeline")
generator = pipeline(
            "image-to-text",
            model=args_model_name_or_path,
            tokenizer=args_model_name_or_path,
            image_processor=args_model_name_or_path,
            torch_dtype=model_dtype,
            device="hpu",
        )

print("Initializing a couple of params (kwargs, batch size, nuber of iterations)")
generate_kwargs = {"max_new_tokens": 200, "do_sample": False}  # Customize as needed
batch_size = 4  # Adjust the batch size as needed
args_n_iterations = 1
start = time.perf_counter()
results = generator(images=[image_stop, image_cats, image_snowman], prompt=prompts, generate_kwargs=generate_kwargs)

end = time.perf_counter()
duration = end - start
print("Total duration:", duration)
print(results)
print("----------------------------=====================-------------------------------")

However, when I run it, I get the following error:

ValueError: The input provided to the model are wrong. The number of image tokens is 2 while the number of image given to the model is 1. This prevents correct indexing and breaks batch generation.

Can anyone help me identify the issue? I would very much appreciate any help.
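
For reference, here is a direct processor/model.generate variant that I would expect to pair the images correctly, adapted from the batched multi-image example in the transformers LLaVA-NeXT documentation. This is only a sketch and is untested on Gaudi; the use of LlavaNextForConditionalGeneration and the device/dtype handling are my assumptions. It reuses image_stop, image_cats, image_snowman, prompt_1, prompt_2, args_model_name_or_path and args_processor from the script above.

# Untested sketch: bypass the image-to-text pipeline and call generate()
# directly, so all three images are matched in order against the <image>
# placeholders across both prompts.
from transformers import LlavaNextForConditionalGeneration

model = LlavaNextForConditionalGeneration.from_pretrained(
    args_model_name_or_path, torch_dtype=torch.bfloat16
).to("hpu")

# prompt_1 contains two image placeholders and prompt_2 contains one,
# so the flat image list below lines up with the three placeholders.
inputs = args_processor(
    images=[image_stop, image_cats, image_snowman],
    text=[prompt_1, prompt_2],
    padding=True,
    return_tensors="pt",
).to("hpu", torch.bfloat16)  # moves tensors to HPU, casts floating inputs to bf16

output_ids = model.generate(**inputs, max_new_tokens=200, do_sample=False)
print(args_processor.batch_decode(output_ids, skip_special_tokens=True))

If this variant works, the pipeline failure would suggest the pipeline pairs each prompt with exactly one image, which cannot satisfy the two image tokens in prompt_1.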
