I was trying to run multi-image inference using llava-v1.6-vicuna-7b. Here is my code:
from PIL import Image
import requests  # needed for downloading the example images below
import habana_frameworks.torch as ht
import habana_frameworks.torch.core as htcore
import torch
from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration, AutoConfig, AutoModelForVision2Seq, pipeline
from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
import time
import os
adapt_transformers_to_gaudi()
print("----------------------------=====================-------------------------------")
print("Trying LLava-Next(1.6)-Vicuna-7B")
print("Setting the device to hpu")
device = torch.device("hpu")
print("Loading the model")
args_model_name_or_path = "/workspace/models/model_llava_v1_6_vicuna_7b"
model_type = AutoConfig.from_pretrained(args_model_name_or_path).model_type
print("Model type: ", model_type)
print("Loading the processor")
args_processor = AutoProcessor.from_pretrained(args_model_name_or_path)
model_dtype = torch.bfloat16
url = "https://www.ilankelman.org/stopsigns/australia.jpg"
image_stop = Image.open(requests.get(url, stream=True).raw)
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image_cats = Image.open(requests.get(url, stream=True).raw)
url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
image_snowman = Image.open(requests.get(url, stream=True).raw)
# Prepare a batch of two prompts, where the first one is a multi-turn conversation and the second is not
conversation_1 = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "What is shown in this image?"},
        ],
    },
    {
        "role": "assistant",
        "content": [
            {"type": "text", "text": "There is a red stop sign in the image."},
        ],
    },
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "What about this image? How many cats do you see?"},
        ],
    },
]
conversation_2 = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "What is shown in this image?"},
        ],
    },
]
prompt_1 = args_processor.apply_chat_template(conversation_1, add_generation_prompt=True)
prompt_2 = args_processor.apply_chat_template(conversation_2, add_generation_prompt=True)
prompts = [prompt_1, prompt_2]
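# Sanity check: conversation_1 has two image turns and conversation_2 has one,
# so the templated prompts should contain 2 and 1 "<image>" placeholders
# respectively (assuming the chat template emits one placeholder per image
# entry), while the pipeline call below pairs only one image with each prompt.
print("Image placeholders in prompt_1:", prompt_1.count("<image>"))
print("Image placeholders in prompt_2:", prompt_2.count("<image>"))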
print("Creating the pipeline")
generator = pipeline(
    "image-to-text",
    model=args_model_name_or_path,
    tokenizer=args_model_name_or_path,
    image_processor=args_model_name_or_path,
    torch_dtype=model_dtype,
    device="hpu",
)
print("Initializing a couple of params (kwargs, batch size, nuber of iterations)")
generate_kwargs = {"max_new_tokens": 200, "do_sample": False} # Customize as needed
batch_size = 4 # Adjust the batch size as needed
args_n_iterations = 1
start = time.perf_counter()
results = generator(images=[image_stop, image_cats, image_snowman], prompt=prompts, generate_kwargs=generate_kwargs)
end = time.perf_counter()
duration = end - start
print("Total duration:", duration)
print(results)
print("----------------------------=====================-------------------------------")
However, I get the following error:
ValueError: The input provided to the model are wrong. The number of image tokens is 2 while the number of image given to the model is 1. This prevents correct indexing and breaks batch generation.
Can anyone help me identify what the issue is? I would very much appreciate any help.
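For reference, the batched multi-image example in the Transformers LLaVA-NeXT documentation calls the processor and model directly instead of going through the image-to-text pipeline, with the flat image list consumed in order across the prompts. Below is a minimal sketch of that approach reusing the variables from my script; I have not verified it on Gaudi, so treat the HPU placement and dtype handling as assumptions rather than a confirmed fix:

from transformers import LlavaNextForConditionalGeneration

# Load the model directly from the same checkpoint path as above.
model = LlavaNextForConditionalGeneration.from_pretrained(
    args_model_name_or_path, torch_dtype=model_dtype
).to(device)

# The processor pairs the flat image list with the prompts in order:
# prompt_1 (two <image> placeholders) consumes image_stop and image_cats,
# prompt_2 (one placeholder) consumes image_snowman.
inputs = args_processor(
    images=[image_stop, image_cats, image_snowman],
    text=prompts,
    padding=True,
    return_tensors="pt",
).to(device, model_dtype)  # casts floating inputs (pixel_values) to bf16 on HPU

outputs = model.generate(**inputs, max_new_tokens=200, do_sample=False)
print(args_processor.batch_decode(outputs, skip_special_tokens=True))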