1 | 1 | import os |
2 | 2 | import unittest |
3 | 3 | import torch |
4 | | -from onnx_diagnostic.ext_test_case import ExtTestCase, never_test |
| 4 | +from onnx_diagnostic.ext_test_case import ExtTestCase, never_test, ignore_warnings |
5 | 5 | from onnx_diagnostic.helpers import string_type |
6 | 6 | from onnx_diagnostic.helpers.cache_helper import make_dynamic_cache, make_encoder_decoder_cache |
7 | 7 | from onnx_diagnostic.helpers.torch_helper import steal_forward |
8 | 8 | from onnx_diagnostic.torch_export_patches import torch_export_patches |
9 | 9 | from onnx_diagnostic.torch_models.hghub.model_inputs import get_untrained_model_with_inputs |
10 | 10 |
11 | 11 |
12 | | -class TestHuggingFaceHubModel(ExtTestCase): |
| 12 | +class TestTryHuggingFaceHubModel(ExtTestCase): |
13 | 13 | @never_test() |
14 | 14 | def test_image_classification(self): |
15 | 15 | # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k image_c |
@@ -988,6 +988,110 @@ def test_imagetext2text_generation_gemma3_4b_it(self): |
988 | 988 | ) |
989 | 989 | print(output_text) |
990 | 990 |
| 991 | + @never_test() |
| 992 | + @ignore_warnings(UserWarning) |
| 993 | + def test_imagetext2text_qwen_2_5_vl_instruct(self): |
| 994 | + """ |
| 995 | + clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k qwen_2_5 |
| 996 | +
| 997 | + :: |
| 998 | +
| 999 | + kwargs=dict( |
| 1000 | + cache_position:T7s3602, |
| 1001 | + input_ids:T7s1x3602, |
| 1002 | + inputs_embeds:None, |
| 1003 | + attention_mask:T7s1x3602, |
| 1004 | + position_ids:T7s4x1x3602, |
| 1005 | + pixel_values:T1s14308x1176, |
| 1006 | + pixel_values_videos:None, |
| 1007 | + image_grid_thw:T7s1x3, |
| 1008 | + video_grid_thw:None, |
| 1009 | + second_per_grid_ts:None, |
| 1010 | + use_cache:bool, |
| 1011 | + return_dict:bool |
| 1012 | + ) |
| 1013 | + """ |
| 1014 | + from transformers import AutoModel, AutoProcessor |
| 1015 | + from qwen_vl_utils import process_vision_info |
| 1016 | + |
| 1017 | + # model_id = "Qwen/Qwen2.5-VL-7B-Instruct" |
| 1018 | + model_id = "Qwen/Qwen2.5-VL-3B-Instruct" |
| 1019 | + if os.environ.get("PRETRAINED", ""): |
| 1020 | + model = AutoModel.from_pretrained(model_id, device_map="auto", dtype="auto").eval() |
| 1021 | + else: |
| 1022 | + |
| 1023 | + def config_reduction(config, task): |
| 1024 | + return { |
| 1025 | + "num_hidden_layers": 2, |
| 1026 | + "text_config": { |
| 1027 | + "num_hidden_layers": 2, |
| 1028 | + "layer_types": ["full_attention", "full_attention"], |
| 1029 | + }, |
| 1030 | + } |
| 1031 | + |
| 1032 | + data = get_untrained_model_with_inputs( |
| 1033 | + model_id, verbose=1, add_second_input=False, config_reduction=config_reduction |
| 1034 | + ) |
| 1035 | + model = data["model"] |
| 1036 | + |
| 1037 | + print(f"-- model.device={model.device}") |
| 1038 | + processor = AutoProcessor.from_pretrained(model_id, use_fast=True) |
| 1039 | + print(f"-- processor={type(processor)}") |
| 1040 | + |
| 1041 | + messages = [ |
| 1042 | + { |
| 1043 | + "role": "user", |
| 1044 | + "content": [ |
| 1045 | + { |
| 1046 | + "type": "image", |
| 1047 | + "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg", |
| 1048 | + }, |
| 1049 | + {"type": "text", "text": "Describe this image."}, |
| 1050 | + ], |
| 1051 | + } |
| 1052 | + ] |
| 1053 | + text = processor.apply_chat_template( |
| 1054 | + messages, tokenize=False, add_generation_prompt=True |
| 1055 | + ) |
| 1056 | + image_inputs, video_inputs = process_vision_info(messages) |
| 1057 | + inputs = processor( |
| 1058 | + text=[text], |
| 1059 | + images=image_inputs, |
| 1060 | + videos=video_inputs, |
| 1061 | + padding=True, |
| 1062 | + return_tensors="pt", |
| 1063 | + ) |
| 1064 | + inputs = inputs.to("cuda") |
| 1065 | + model = model.to("cuda").to(torch.bfloat16) |
| 1066 | + |
| 1067 | + print(f"-- processor {type(processor)}") |
| 1068 | + print(f"-- inputs={self.string_type(inputs, with_shape=True, with_min_max=True)}") |
| 1069 | + |
| 1070 | + print() |
| 1071 | + with ( |
| 1072 | + torch_export_patches( |
| 1073 | + patch_torch=False, |
| 1074 | + patch_sympy=False, |
| 1075 | + patch_transformers=True, |
| 1076 | + verbose=1, |
| 1077 | + ), |
| 1078 | + steal_forward( |
| 1079 | + [model, model.visual], |
| 1080 | + dump_file=self.get_dump_file("test_imagetext2text_qwen_2_5_vl_instruct.onnx"), |
| 1081 | + dump_drop={"attention_mask", "past_key_values", "pixel_values"}, |
| 1082 | + save_as_external_data=False, |
| 1083 | + with_shapes=True, |
| 1084 | + ), |
| 1085 | + ): |
| 1086 | + generated_ids = model.generate(**inputs, max_new_tokens=128) |
| 1087 | + generated_ids_trimmed = [ |
| 1088 | + out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) |
| 1089 | + ] |
| 1090 | + output_text = processor.batch_decode( |
| 1091 | + generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False |
| 1092 | + ) |
| 1093 | + print(output_text) |
| 1094 | + |
991 | 1095 |
992 | 1096 | if __name__ == "__main__": |
993 | 1097 | unittest.main(verbosity=2) |