diff --git a/_unittests/ut_tasks/try_export.py b/_unittests/ut_tasks/try_export.py
index bc1170df..75f7e173 100644
--- a/_unittests/ut_tasks/try_export.py
+++ b/_unittests/ut_tasks/try_export.py
@@ -46,20 +46,26 @@ def test_imagetext2text_qwen_2_5_vl_instruct_visual(self):
 
         from transformers import AutoModel, AutoProcessor
 
-        # model_id = "Qwen/Qwen2.5-VL-7B-Instruct"
-        model_id = "Qwen/Qwen2.5-VL-3B-Instruct"
+        model_id = "Qwen/Qwen2.5-VL-7B-Instruct"
+        # model_id = "Qwen/Qwen2.5-VL-3B-Instruct"
         if os.environ.get("PRETRAINED", ""):
-            model = AutoModel.from_pretrained(model_id, device_map="auto", dtype="auto").eval()
+            print("-- pretrained model")
+            model = AutoModel.from_pretrained(
+                model_id, device_map=device, dtype=torch_dtype, attn_implementation="sdpa"
+            ).eval()
         else:
+            print("-- random model")
 
             def _config_reduction(config, task):
                 return {
-                    "num_hidden_layers": 2,
+                    # "num_hidden_layers": 2,
                     "text_config": {
                         "num_hidden_layers": 2,
                         "layer_types": ["full_attention", "full_attention"],
                     },
                     # "_attn_implementation": "flash_attention_2",
+                    "_attn_implementation": "sdpa",
+                    "dtype": "float16",
                 }
 
             config_reduction = _config_reduction
@@ -70,6 +76,7 @@ def _config_reduction(config, task):
 
 
         model = model.to(device).to(getattr(torch, dtype))
+        print(f"-- config._attn_implementation={model.config._attn_implementation}")
         print(f"-- model.dtype={model.dtype}")
         print(f"-- model.device={model.device}")
         processor = AutoProcessor.from_pretrained(model_id, use_fast=True)
diff --git a/_unittests/ut_tasks/try_tasks.py b/_unittests/ut_tasks/try_tasks.py
index 7af72324..4fb73b23 100644
--- a/_unittests/ut_tasks/try_tasks.py
+++ b/_unittests/ut_tasks/try_tasks.py
@@ -1011,15 +1011,20 @@ def test_imagetext2text_qwen_2_5_vl_instruct(self):
             return_dict:bool
         )
         """
-        import transformers
-        from transformers import AutoModel, AutoProcessor
+        from transformers import AutoProcessor
         from qwen_vl_utils import process_vision_info
 
-        # model_id = "Qwen/Qwen2.5-VL-7B-Instruct"
-        model_id = "Qwen/Qwen2.5-VL-3B-Instruct"
+        model_id = "Qwen/Qwen2.5-VL-7B-Instruct"
+        # model_id = "Qwen/Qwen2.5-VL-3B-Instruct"
         if os.environ.get("PRETRAINED", ""):
-            model = AutoModel.from_pretrained(model_id, device_map="auto", dtype="auto").eval()
+            print("-- use pretrained model")
+            from transformers import Qwen2_5_VLForConditionalGeneration
+
+            model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+                model_id, device_map="auto", dtype="auto", trust_remote_code=True
+            ).eval()
         else:
+            print("-- use dummy model")
 
             def config_reduction(config, task):
                 return {
@@ -1035,6 +1040,7 @@ def config_reduction(config, task):
         )
         model = data["model"]
+        print(f"-- model type={type(model)}")
         print(f"-- model.device={model.device}")
         processor = AutoProcessor.from_pretrained(model_id, use_fast=True)
         print(f"-- processor={type(processor)}")
@@ -1063,25 +1069,13 @@ def config_reduction(config, task):
             padding=True,
             return_tensors="pt",
         )
-        inputs = inputs.to("cuda")
-        model = model.to("cuda").to(torch.bfloat16)
+        # model = model.to("cuda").to(torch.bfloat16)
+        # inputs = inputs.to("cuda")
 
         print(f"-- processor {type(processor)}")
         print(f"-- inputs={self.string_type(inputs, with_shape=True, with_min_max=True)}")
-
-        f_ = transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.apply_multimodal_rotary_pos_emb
-
-        def _apply_multimodal_rotary_pos_emb(*args, **kwargs):
-            print(
-                "-- apply_multimodal_rotary_pos_emb:",
-                self.string_type(args, with_shape=True),
-                self.string_type(kwargs, with_shape=True),
-            )
-            return f_(*args, **kwargs)
-
-        transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.apply_multimodal_rotary_pos_emb = (
-            _apply_multimodal_rotary_pos_emb
-        )
+        generated_ids = model.generate(**inputs, max_new_tokens=128)
+        print("-- second")
 
         print()
         with (
diff --git a/onnx_diagnostic/helpers/mini_onnx_builder.py b/onnx_diagnostic/helpers/mini_onnx_builder.py
index 2d727259..648155a2 100644
--- a/onnx_diagnostic/helpers/mini_onnx_builder.py
+++ b/onnx_diagnostic/helpers/mini_onnx_builder.py
@@ -422,6 +422,27 @@ def create_onnx_model_from_input_tensors(
     :return: ModelProto
 
     The function raises an error if not supported.
+    An example:
+
+    .. code-block:: python
+
+        from onnx_diagnostic.helpers.mini_onnx_builder import (
+            create_onnx_model_from_input_tensors,
+        )
+        import onnx
+
+        proto = create_onnx_model_from_input_tensors(
+            dict(
+                query_states=query_states,
+                key_states=key_states,
+                value_states=value_states,
+                cu_seqlens=cu_seqlens,
+                max_seqlen=(cu_seqlens[1:] - cu_seqlens[:-1]).max(),
+                scaling=self.scaling,
+                attn_output=attn_output,
+            )
+        )
+        onnx.save(proto, "attention_inputs.onnx")
     """
     if switch_low_high is None:
         switch_low_high = sys.byteorder != "big"
@@ -461,7 +482,17 @@ def _unflatten(
         if spl[-1] == "array":
             return pos + 1, outputs[pos]
         if spl[-1] == "tensor":
-            return pos + 1, torch.from_numpy(outputs[pos]).to(device)
+            try:
+                return pos + 1, torch.from_numpy(outputs[pos]).to(device)
+            except TypeError:
+                # should be made more robust: torch.from_numpy does not support ml_dtypes.bfloat16
+                import ml_dtypes
+
+                if outputs[pos].dtype == ml_dtypes.bfloat16:
+                    return pos + 1, torch.from_numpy(outputs[pos].astype(float)).to(device).to(
+                        torch.bfloat16
+                    )
+                raise
         raise AssertionError(f"Unexpected name {name!r} in {names}")
 
     res: List[Any] = []
@@ -557,6 +588,19 @@ def create_input_tensors_from_onnx_model(
     :return: restored data
 
     See example :ref:`l-plot-intermediate-results` for an example.
+
+    .. code-block:: python
+
+        import os
+        from onnx_diagnostic.helpers.mini_onnx_builder import (
+            create_input_tensors_from_onnx_model,
+        )
+        from onnx_diagnostic.helpers import string_type
+
+        restored = create_input_tensors_from_onnx_model("attention_inputs.onnx")
+        for k, v in restored.items():
+            print(f"{k}: {string_type(v, with_shape=True, with_min_max=True)}")
+
     """
     if engine == "ExtendedReferenceEvaluator":
         from ..reference import ExtendedReferenceEvaluator
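Note: the two docstring examples added to mini_onnx_builder.py form a round trip, dumping a
dictionary of tensors into a small ONNX file and restoring it later. A self-contained sketch of
that workflow, with illustrative tensor names and shapes standing in for the attention inputs
used in the docstrings above:

.. code-block:: python

    import torch
    import onnx
    from onnx_diagnostic.helpers import string_type
    from onnx_diagnostic.helpers.mini_onnx_builder import (
        create_input_tensors_from_onnx_model,
        create_onnx_model_from_input_tensors,
    )

    # Illustrative stand-ins for the intermediate tensors dumped in the docstring examples.
    inputs = dict(
        query_states=torch.randn(2, 8, 16, 64),
        key_states=torch.randn(2, 8, 16, 64),
        scaling=0.125,
    )

    # Serialize the whole python structure into one ONNX model holding the tensors.
    proto = create_onnx_model_from_input_tensors(inputs)
    onnx.save(proto, "attention_inputs.onnx")

    # Restore the same structure from disk and print a summary of every entry.
    restored = create_input_tensors_from_onnx_model("attention_inputs.onnx")
    for k, v in restored.items():
        print(f"{k}: {string_type(v, with_shape=True, with_min_max=True)}")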