Commit b5b0823

update the example
1 parent 16fddf2

1 file changed

_unittests/ut_tasks/try_tasks.py: 108 additions, 0 deletions
@@ -1,3 +1,4 @@
+import os
 import unittest
 from onnx_diagnostic.ext_test_case import ExtTestCase, never_test
 from onnx_diagnostic.helpers import string_type
@@ -540,6 +541,113 @@ def test_object_detection(self):
                 f"{round(score.item(), 3)} at location {box}"
             )
 
+    @never_test()
+    def test_qwen_image(self):
+        # clear && NEVERTEST=1 VIDEO=0 python _unittests/ut_tasks/try_tasks.py -k qwen_image
+        # https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct
+
+        from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+        from qwen_vl_utils import process_vision_info
+
+        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            "Qwen/Qwen2.5-VL-3B-Instruct", torch_dtype="auto", device_map="auto"
+        )
+
+        # default processor
+        processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")
+
+        video = int(os.environ.get("VIDEO", "0"))
+        if video:
+            # Messages containing a video url and a text query
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "video",
+                            "video": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-VL/space_woaudio.mp4",
+                        },
+                        {"type": "text", "text": "Describe this video."},
+                    ],
+                }
+            ]
+
+            text = processor.apply_chat_template(
+                messages, tokenize=False, add_generation_prompt=True
+            )
+            image_inputs, video_inputs, video_kwargs = process_vision_info(
+                messages, return_video_kwargs=True
+            )
+            inputs = processor(
+                text=[text],
+                images=image_inputs,
+                videos=video_inputs,
+                padding=True,
+                return_tensors="pt",
+                **video_kwargs,
+            )
+        else:
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image",
+                            "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
+                        },
+                        {
+                            "type": "image",
+                            "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
+                        },
+                        {"type": "text", "text": "Describe these images."},
+                    ],
+                }
+            ]
+
+            # Preparation for inference
+            text = processor.apply_chat_template(
+                messages, tokenize=False, add_generation_prompt=True
+            )
+            image_inputs, video_inputs = process_vision_info(messages)
+            inputs = processor(
+                text=[text],
+                images=image_inputs,
+                videos=video_inputs,
+                padding=True,
+                return_tensors="pt",
+            )
+        inputs = inputs.to("cuda")
+
+        # Inference: Generation of the output
+        print()
+        with steal_forward(model, with_min_max=True):
+            generated_ids = model.generate(**inputs, max_new_tokens=128)
+        generated_ids_trimmed = [
+            out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        output_text = processor.batch_decode(
+            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+        print(output_text)
+
+        """
+        ---- stolen forward for class Qwen2_5_VLForConditionalGeneration -- iteration 0
+        cache_position:T7s3602,
+        past_key_values:DynamicCache(key_cache=#0[], value_cache=#0[]),
+        input_ids:T7s1x3602,
+        attention_mask:T7s1x3602,
+        pixel_values:T1s14308x1176,
+        image_grid_thw:T7s2x3
+        ---- stolen forward for class Qwen2_5_VLForConditionalGeneration -- iteration 1
+        cache_position:T7s1,
+        past_key_values:DynamicCache(key_cache=#36[T16s1x2x3602x128,...],
+            value_cache=#36[T16s1x2x3602x128,...]),
+        input_ids:T7s1x1,
+        attention_mask:T7s1x3603,
+        pixel_values_videos:None,
+        image_grid_thw:T7s2x3
+        """

 if __name__ == "__main__":
     unittest.main(verbosity=2)
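Note on the notation in the docstring above: it comes from onnx_diagnostic's string_type helper (already imported at the top of this file), which renders a tensor as T<dtype code>s<shape>, where the code is an ONNX TensorProto dtype (1 = float32, 7 = int64, 16 = bfloat16); #36[...] denotes a list of 36 tensors. So T7s1x3602 is an int64 tensor of shape 1x3602. A minimal sketch, assuming steal_forward formats inputs with string_type(..., with_shape=True); the outputs in the comments are illustrative:

import torch
from onnx_diagnostic.helpers import string_type

# int64 tensor of shape 2x3 -> "T7s2x3" (7 = TensorProto.INT64)
print(string_type(torch.zeros((2, 3), dtype=torch.int64), with_shape=True))
# float32 tensor of shape 1x4 -> "T1s1x4" (1 = TensorProto.FLOAT)
print(string_type(torch.zeros((1, 4), dtype=torch.float32), with_shape=True))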
