+import os
 import unittest
 from onnx_diagnostic.ext_test_case import ExtTestCase, never_test
 from onnx_diagnostic.helpers import string_type
@@ -540,6 +541,113 @@ def test_object_detection(self): |
                 f"{round(score.item(), 3)} at location {box}"
             )
 
+    @never_test()
+    def test_qwen_image(self):
+        # clear&&NEVERTEST=1 VIDEO=0 python _unittests/ut_tasks/try_tasks.py -k qwen_image
+        # https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct
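+        # Runs Qwen2.5-VL image/video-to-text generation end to end and uses
+        # steal_forward to record the inputs the model receives at each
+        # generation step (captured log reproduced at the end of the test).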
+
+        from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+        from qwen_vl_utils import process_vision_info
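+        # qwen_vl_utils is the companion package from the Qwen2.5-VL recipes
+        # (pip install qwen-vl-utils); process_vision_info fetches the
+        # image/video URLs in the messages and prepares them for the processor.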
+
+        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            "Qwen/Qwen2.5-VL-3B-Instruct", torch_dtype="auto", device_map="auto"
+        )
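+        # torch_dtype="auto" takes the dtype stored in the checkpoint
+        # (bfloat16 here, hence the T16 tensors in the log below);
+        # device_map="auto" places the weights on the available device.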
+
+        # default processor
+        processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")
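+        # The processor also accepts min_pixels/max_pixels to bound the
+        # number of visual tokens per image; defaults are used here.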
+
+        video = int(os.environ.get("VIDEO", "0"))
+        if video:
+            # Messages containing a video url and a text query
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "video",
+                            "video": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-VL/space_woaudio.mp4",
+                        },
+                        {"type": "text", "text": "Describe this video."},
+                    ],
+                }
+            ]
+
+            text = processor.apply_chat_template(
+                messages, tokenize=False, add_generation_prompt=True
+            )
+            image_inputs, video_inputs, video_kwargs = process_vision_info(
+                messages, return_video_kwargs=True
+            )
+            inputs = processor(
+                text=[text],
+                images=image_inputs,
+                videos=video_inputs,
+                padding=True,
+                return_tensors="pt",
+                **video_kwargs,
+            )
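+            # return_video_kwargs=True also returns sampling metadata such as
+            # fps, which is passed through to the processor call above.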
+        else:
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image",
+                            "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
+                        },
+                        {
+                            "type": "image",
+                            "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
+                        },
+                        {"type": "text", "text": "Describe these images."},
+                    ],
+                }
+            ]
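+            # The same demo image is passed twice, so the model receives two
+            # images; this matches image_grid_thw:T7s2x3 in the log below.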
+
+            # Preparation for inference
+            text = processor.apply_chat_template(
+                messages, tokenize=False, add_generation_prompt=True
+            )
+            image_inputs, video_inputs = process_vision_info(messages)
+            inputs = processor(
+                text=[text],
+                images=image_inputs,
+                videos=video_inputs,
+                padding=True,
+                return_tensors="pt",
+            )
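+        # Move the whole BatchFeature onto the GPU; done after the if/else so
+        # both the video and the image branches are covered.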
+        inputs = inputs.to("cuda")
+
+        # Inference: Generation of the output
+        print()
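+        # steal_forward wraps model.forward and prints each call's inputs in
+        # string_type notation, plus min/max values (with_min_max=True); the
+        # output is reproduced in the docstring below.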
+        with steal_forward(model, with_min_max=True):
+            generated_ids = model.generate(**inputs, max_new_tokens=128)
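+        # generate returns prompt + completion ids; slice off the prompt
+        # tokens before decoding so only the new text is printed.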
+        generated_ids_trimmed = [
+            out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        output_text = processor.batch_decode(
+            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+        print(output_text)
+
+        """
+        ---- stolen forward for class Qwen2_5_VLForConditionalGeneration -- iteration 0
+        cache_position:T7s3602,
+        past_key_values:DynamicCache(key_cache=#0[], value_cache=#0[]),
+        input_ids:T7s1x3602,
+        attention_mask:T7s1x3602
+        pixel_values:T1s14308x1176
+        image_grid_thw:T7s2x3
+        ---- stolen forward for class Qwen2_5_VLForConditionalGeneration -- iteration 1
+        cache_position:T7s1,
+        past_key_values:DynamicCache(key_cache=#36[T16s1x2x3602x128,...],
+                                     value_cache=#36[T16s1x2x3602x128,...]),
+        input_ids:T7s1x1,
+        attention_mask:T7s1x3603,
+        pixel_values_videos:None,
+        image_grid_thw:T7s2x3
+        """
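+        # Notation: string_type encodes tensors as T<onnx dtype>s<shape>, so
+        # T7s1x3602 is an int64 tensor of shape 1x3602 and T16s... a bfloat16
+        # tensor (7=INT64, 1=FLOAT, 16=BFLOAT16 in onnx.TensorProto);
+        # #36[...] is a list of 36 entries, one per decoder layer.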
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)