Skip to content

Follow the code in demo.ipynb but model cannot see the image #4

@CatYing

Description

@CatYing

CODE:

from PIL import Image
import torch
import os
from llava.serve.classes.Utils import *
from llava.serve.classes.Compiler import *


from llava.model.builder import load_mixed_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria


def render_content_with_text(key, value):
    if FILL_WITH_RANDOM_TEXT:
        if key.find("btn") != -1:
            value = value.replace(TEXT_PLACE_HOLDER, Utils.get_random_text())
        elif key.find("title") != -1:
            value = value.replace(TEXT_PLACE_HOLDER, Utils.get_random_text(length_text=5, space_number=0))
        elif key.find("text") != -1:
            value = value.replace(TEXT_PLACE_HOLDER,
                                  Utils.get_random_text(length_text=56, space_number=7, with_upper_case=False))
    return value


FILL_WITH_RANDOM_TEXT = True
TEXT_PLACE_HOLDER = "[]"

model_path = "/cm/CodeFuse-VLM-14B/CodeFuse-VLM-14B/"
model_base = None
model_name = "qwen-vl-14b"
vision_tower_path = os.path.join(model_path, 'Qwen-VL-visual')
mm_projector_type = "cross_attn"
mm_projector_path = os.path.join(model_path, 'mm_projector/mm_projector.bin')

disable_torch_init()
tokenizer, model, image_processor, context_len = load_mixed_pretrained_model(model_path, model_base, model_name,
                                                                             vision_tower_path, mm_projector_type,
                                                                             mm_projector_path, device_map="auto")

compiler = Compiler("/cm/CodeFuse-VLM-14B/CodeFuse-MFT-VLM/llava/serve/assets/web-dsl-mapping.json")

tokenizer.pad_token_id = tokenizer.eod_id
# model = model.cuda()

image_fn = "/cm/CodeFuse-VLM-14B/image_test_vlm.png"

image = Image.open(image_fn).convert('RGB')
image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
import pdb; pdb.set_trace()

# image.show()


def inference(prompt):
    inputs = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    prompt = "<|im_start|>user\n" + "Picture 1:\n" + prompt + "<|im_end|>\n" + "<|im_start|>assistant\n"
    inputs += prompt

    tokens = tokenizer(
        inputs,
        max_length=tokenizer.model_max_length,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    input_ids = tokens.input_ids.cuda()

    stop_str = tokenizer.pad_token
    keywords = [stop_str]
    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=image_tensor.unsqueeze(0).cuda(),
            do_sample=True,
            temperature=0.2,
            top_p=0.3,
            top_k=0,
            max_new_tokens=2048,
            return_dict_in_generate=False,
            use_cache=True)
    input_token_len = input_ids.shape[1]
    n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
    if n_diff_input_output > 0:
        print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
    output_text = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
    return output_text


ret = inference("Please generate DSL for the skect on this image:\n")
print(ret)

output_text = ret.replace("<|im_end|>", "").replace("<|im_start|>", "").replace("\n", "")
output_text = output_text.lower()
output_text = output_text.replace("{", "{\n").replace("}", "\n}\n").replace("\n\n", '\n').rstrip("\n")


output_html = compiler.compile(output_text, None, rendering_function=render_content_with_text)
print(output_html)

then the model returns:

I'm sorry, but I cannot see the image you are referring to. Please provide a detailed description of the image or upload the image itself so that I can generate the DSL for you.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions