CODE:
from PIL import Image
import torch
import os
from llava.serve.classes.Utils import *
from llava.serve.classes.Compiler import *
from llava.model.builder import load_mixed_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
def render_content_with_text(key, value):
    # Replace the [] placeholder with random text, sized per element type.
    if FILL_WITH_RANDOM_TEXT:
        if key.find("btn") != -1:
            value = value.replace(TEXT_PLACE_HOLDER, Utils.get_random_text())
        elif key.find("title") != -1:
            value = value.replace(TEXT_PLACE_HOLDER, Utils.get_random_text(length_text=5, space_number=0))
        elif key.find("text") != -1:
            value = value.replace(TEXT_PLACE_HOLDER,
                                  Utils.get_random_text(length_text=56, space_number=7, with_upper_case=False))
    return value
FILL_WITH_RANDOM_TEXT = True
TEXT_PLACE_HOLDER = "[]"
model_path = "/cm/CodeFuse-VLM-14B/CodeFuse-VLM-14B/"
model_base = None
model_name = "qwen-vl-14b"
vision_tower_path = os.path.join(model_path, 'Qwen-VL-visual')
mm_projector_type = "cross_attn"
mm_projector_path = os.path.join(model_path, 'mm_projector/mm_projector.bin')
disable_torch_init()
tokenizer, model, image_processor, context_len = load_mixed_pretrained_model(
    model_path, model_base, model_name,
    vision_tower_path, mm_projector_type,
    mm_projector_path, device_map="auto")
compiler = Compiler("/cm/CodeFuse-VLM-14B/CodeFuse-MFT-VLM/llava/serve/assets/web-dsl-mapping.json")
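# Qwen's tokenizer ships without a pad token by default; reuse the end-of-document id.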
tokenizer.pad_token_id = tokenizer.eod_id
# model = model.cuda()
image_fn = "/cm/CodeFuse-VLM-14B/image_test_vlm.png"
image = Image.open(image_fn).convert('RGB')
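# preprocess() returns batched pixel_values of shape (1, C, H, W); [0] keeps the single image.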
image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
# import pdb; pdb.set_trace()
# image.show()
def inference(prompt):
    # Build a ChatML-style prompt around the user request.
    inputs = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    prompt = "<|im_start|>user\n" + "Picture 1:\n" + prompt + "<|im_end|>\n" + "<|im_start|>assistant\n"
    inputs += prompt
    tokens = tokenizer(
        inputs,
        max_length=tokenizer.model_max_length,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    input_ids = tokens.input_ids.cuda()
    # Stop generation when the pad (end-of-document) token appears.
    stop_str = tokenizer.pad_token
    keywords = [stop_str]
    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=image_tensor.unsqueeze(0).cuda(),
            do_sample=True,
            temperature=0.2,
            top_p=0.3,
            top_k=0,
            max_new_tokens=2048,
            return_dict_in_generate=False,
            use_cache=True)
    # Strip the prompt tokens and decode only the newly generated text.
    input_token_len = input_ids.shape[1]
    n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
    if n_diff_input_output > 0:
        print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
    output_text = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
    return output_text
ret = inference("Please generate DSL for the sketch in this image:\n")
print(ret)
output_text = ret.replace("<|im_end|>", "").replace("<|im_start|>", "").replace("\n", "")
output_text = output_text.lower()
output_text = output_text.replace("{", "{\n").replace("}", "\n}\n").replace("\n\n", '\n').rstrip("\n")
output_html = compiler.compile(output_text, None, rendering_function=render_content_with_text)
print(output_html)

Then the model returns:
I'm sorry, but I cannot see the image you are referring to. Please provide a detailed description of the image or upload the image itself so that I can generate the DSL for you.
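For what it's worth, the script above imports tokenizer_image_token but never calls it: the prompt is tokenized with the plain tokenizer(...) call, so no image placeholder token ever reaches input_ids, which would match the model answering as though no picture was attached. Below is a minimal sketch of the prompt construction with an image placeholder, assuming this fork keeps upstream LLaVA's DEFAULT_IMAGE_TOKEN / IMAGE_TOKEN_INDEX constants (from llava.constants) and the upstream tokenizer_image_token signature; the function name inference_with_image_token is hypothetical:

# Hypothetical variant of inference(); names below assume upstream LLaVA:
# DEFAULT_IMAGE_TOKEN ("<image>") and IMAGE_TOKEN_INDEX live in llava.constants,
# and tokenizer_image_token(prompt, tokenizer, image_token_index, return_tensors)
# splices IMAGE_TOKEN_INDEX into the ids wherever <image> occurs in the prompt.
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN

def inference_with_image_token(prompt):
    inputs = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    # Embed the image placeholder where the picture should appear.
    inputs += ("<|im_start|>user\n" + "Picture 1: " + DEFAULT_IMAGE_TOKEN + "\n"
               + prompt + "<|im_end|>\n" + "<|im_start|>assistant\n")
    input_ids = tokenizer_image_token(
        inputs, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
    ).unsqueeze(0).cuda()
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=image_tensor.unsqueeze(0).cuda(),
            do_sample=True,
            temperature=0.2,
            top_p=0.3,
            max_new_tokens=2048,
            use_cache=True)
    # Decode only the newly generated tokens.
    return tokenizer.batch_decode(
        output_ids[:, input_ids.shape[1]:], skip_special_tokens=True)[0]

If this fork diverges from upstream LLaVA here (e.g. the Qwen-VL vision tower expects its own image tags), the maintainers would know the intended placeholder; the point is only that something must mark where the image features get spliced into the prompt.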