CODE:
from PIL import Image
import torch
import os
from llava.serve.classes.Utils import *
from llava.serve.classes.Compiler import *
from llava.model.builder import load_mixed_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
def render_content_with_text(key, value):
    # Replace the [] placeholder with random text, sized per element type.
    if FILL_WITH_RANDOM_TEXT:
        if key.find("btn") != -1:
            value = value.replace(TEXT_PLACE_HOLDER, Utils.get_random_text())
        elif key.find("title") != -1:
            value = value.replace(TEXT_PLACE_HOLDER, Utils.get_random_text(length_text=5, space_number=0))
        elif key.find("text") != -1:
            value = value.replace(TEXT_PLACE_HOLDER,
                                  Utils.get_random_text(length_text=56, space_number=7, with_upper_case=False))
    return value
FILL_WITH_RANDOM_TEXT = True
TEXT_PLACE_HOLDER = "[]"
model_path = "/cm/CodeFuse-VLM-14B/CodeFuse-VLM-14B/"
model_base = None
model_name = "qwen-vl-14b"
vision_tower_path = os.path.join(model_path, 'Qwen-VL-visual')
mm_projector_type = "cross_attn"
mm_projector_path = os.path.join(model_path, 'mm_projector/mm_projector.bin')
disable_torch_init()
tokenizer, model, image_processor, context_len = load_mixed_pretrained_model(
    model_path, model_base, model_name,
    vision_tower_path, mm_projector_type,
    mm_projector_path, device_map="auto")
compiler = Compiler("/cm/CodeFuse-VLM-14B/CodeFuse-MFT-VLM/llava/serve/assets/web-dsl-mapping.json")
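# Qwen's tokenizer ships without a pad token by default; reuse the end-of-document id.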
tokenizer.pad_token_id = tokenizer.eod_id
# model = model.cuda()
image_fn = "/cm/CodeFuse-VLM-14B/image_test_vlm.png"
image = Image.open(image_fn).convert('RGB')
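# preprocess() returns batched pixel_values of shape (1, C, H, W); [0] keeps the single image.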
image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
# import pdb; pdb.set_trace()
# image.show()
def inference(prompt):
    # Build a ChatML-style prompt around the user request.
    inputs = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    prompt = "<|im_start|>user\n" + "Picture 1:\n" + prompt + "<|im_end|>\n" + "<|im_start|>assistant\n"
    inputs += prompt
    tokens = tokenizer(
        inputs,
        max_length=tokenizer.model_max_length,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    input_ids = tokens.input_ids.cuda()
    # Stop generation when the pad (end-of-document) token appears.
    stop_str = tokenizer.pad_token
    keywords = [stop_str]
    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=image_tensor.unsqueeze(0).cuda(),
            do_sample=True,
            temperature=0.2,
            top_p=0.3,
            top_k=0,
            max_new_tokens=2048,
            return_dict_in_generate=False,
            use_cache=True)
    # Strip the prompt tokens and decode only the newly generated text.
    input_token_len = input_ids.shape[1]
    n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
    if n_diff_input_output > 0:
        print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
    output_text = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
    return output_text
ret = inference("Please generate DSL for the sketch in this image:\n")
print(ret)
output_text = ret.replace("<|im_end|>", "").replace("<|im_start|>", "").replace("\n", "")
output_text = output_text.lower()
output_text = output_text.replace("{", "{\n").replace("}", "\n}\n").replace("\n\n", '\n').rstrip("\n")
output_html = compiler.compile(output_text, None, rendering_function=render_content_with_text)
print(output_html)

Then the model returns:
I'm sorry, but I cannot see the image you are referring to. Please provide a detailed description of the image or upload the image itself so that I can generate the DSL for you.
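For what it's worth, the script above imports tokenizer_image_token but never calls it: the prompt is tokenized with the plain tokenizer(...) call, so no image placeholder token ever reaches input_ids, which would match the model answering as though no picture was attached. Below is a minimal sketch of the prompt construction with an image placeholder, assuming this fork keeps upstream LLaVA's DEFAULT_IMAGE_TOKEN / IMAGE_TOKEN_INDEX constants (from llava.constants) and the upstream tokenizer_image_token signature; the function name inference_with_image_token is hypothetical:

# Hypothetical variant of inference(); names below assume upstream LLaVA:
# DEFAULT_IMAGE_TOKEN ("<image>") and IMAGE_TOKEN_INDEX live in llava.constants,
# and tokenizer_image_token(prompt, tokenizer, image_token_index, return_tensors)
# splices IMAGE_TOKEN_INDEX into the ids wherever <image> occurs in the prompt.
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN

def inference_with_image_token(prompt):
    inputs = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    # Embed the image placeholder where the picture should appear.
    inputs += ("<|im_start|>user\n" + "Picture 1: " + DEFAULT_IMAGE_TOKEN + "\n"
               + prompt + "<|im_end|>\n" + "<|im_start|>assistant\n")
    input_ids = tokenizer_image_token(
        inputs, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
    ).unsqueeze(0).cuda()
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=image_tensor.unsqueeze(0).cuda(),
            do_sample=True,
            temperature=0.2,
            top_p=0.3,
            max_new_tokens=2048,
            use_cache=True)
    # Decode only the newly generated tokens.
    return tokenizer.batch_decode(
        output_ids[:, input_ids.shape[1]:], skip_special_tokens=True)[0]

If this fork diverges from upstream LLaVA here (e.g. the Qwen-VL vision tower expects its own image tags), the maintainers would know the intended placeholder; the point is only that something must mark where the image features get spliced into the prompt.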