Getting an error while generating an image caption with BLIP-2
```python
import requests
import torch
from PIL import Image
from transformers import AutoProcessor, Blip2ForConditionalGeneration

url = "https://media.newyorker.com/cartoons/63dc6847be24a6a76d90eb99/master/w_1160,c_limit/230213_a26611_838.jpg"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
display(image.resize((596, 437)))  # display() is available in Jupyter/IPython

processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    device_map="auto",
    load_in_8bit=True,
)

# Use the GPU to make text generation faster
device = "cuda" if torch.cuda.is_available() else "cpu"

inputs = processor(image, return_tensors="pt").to(device, torch.float16)
generated_ids = model.generate(**inputs, max_new_tokens=20)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
print(generated_text)
```
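
The indexing result of shape [0] in the traceback below seems to come from the prompt the language model sees: it contains no image placeholder tokens even though the checkpoint config defines `image_token_index`. A quick sanity check along these lines (a diagnostic sketch, not part of the repro above; it only inspects `inputs` and `model.config`) makes the mismatch visible:

```python
# Diagnostic sketch: does the prompt actually contain the image placeholder
# tokens that generate() tries to fill with the projected Q-Former outputs?
tok_id = getattr(model.config, "image_token_index", None)
input_ids = inputs.get("input_ids")
print("image_token_index in model config:", tok_id)
if input_ids is None:
    # The processor returned only pixel_values, so generate() appears to fall
    # back to a bare BOS prompt internally, which contains no image tokens.
    print("processor produced no input_ids")
else:
    print("image tokens in prompt:", (input_ids == tok_id).sum().item())
```
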
> ---------------------------------------------------------------------------
> RuntimeError Traceback (most recent call last)
> Cell In[31], line 3
> 1 inputs = processor(image, return_tensors="pt").to(device, torch.float16)
> ----> 3 generated_ids = model.generate(**inputs, max_new_tokens=20)
> 4 generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
> 5 print(generated_text)
>
> File ~/.local/lib/python3.8/site-packages/torch/utils/_contextlib.py:116, in context_decorator.<locals>.decorate_context(*args, **kwargs)
> 113 @functools.wraps(func)
> 114 def decorate_context(*args, **kwargs):
> 115 with ctx_factory():
> --> 116 return func(*args, **kwargs)
>
> File ~/.local/lib/python3.8/site-packages/transformers/models/blip_2/modeling_blip_2.py:2316, in Blip2ForConditionalGeneration.generate(self, pixel_values, input_ids, attention_mask, interpolate_pos_encoding, **generate_kwargs)
> 2314 if getattr(self.config, "image_token_index", None) is not None:
> 2315 special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
> -> 2316 inputs_embeds[special_image_mask] = language_model_inputs.flatten()
> 2317 else:
> 2318 logger.warning_once(
> 2319 "Expanding inputs for image tokens in BLIP-2 should be done in processing. "
> 2320 "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. "
> 2321 "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
> 2322 )
>
> RuntimeError: shape mismatch: value tensor of shape [81920] cannot be broadcast to indexing result of shape [0]
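
The deprecation warning in the traceback points to the gist above for updating older BLIP-2 checkpoints so that image tokens are expanded by the processor rather than inside `generate()`. As a rough, unverified sketch of that migration (the `<image>` token string, the `num_query_tokens` handling, and the embedding resize below are my assumptions; the authoritative steps are in the linked gist), something like this should make the processor emit the placeholders the model expects:

```python
from transformers import AddedToken

# Assumption: mirror the migration the linked gist describes; verify against the
# gist before relying on this. Let the processor know how many query tokens the
# Q-Former produces, and register an <image> placeholder token.
processor.num_query_tokens = model.config.num_query_tokens
image_token = AddedToken("<image>", normalized=False, special=True)
processor.tokenizer.add_tokens([image_token], special_tokens=True)

# Give the new token an embedding row and point the model config at it.
model.resize_token_embeddings(len(processor.tokenizer), pad_to_multiple_of=64)
model.config.image_token_index = len(processor.tokenizer) - 1
```

Alternatively, if the Salesforce/blip2-opt-2.7b repo on the Hub has already been updated with these attributes, re-downloading the processor (for example with `force_download=True` in `from_pretrained`) may be enough to avoid the mismatch; pinning an older transformers release that still expands the image tokens inside `generate()` is another possible stopgap.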