
Qwen2.5-VL-3B results do not match the ONNX-converted model #40

@jianbaba

Description


Hi, one more question. Background: I only exported the part of onnx_b that comes before the ViT to ONNX, and switched it to dynamic inputs, but the exported ONNX results do not match the results of model_B. The code is below; could you take a look?

import sys
import os
sys.path.append(os.path.abspath("./qwen-vl-utils/src"))
import requests
from PIL import Image
import torch
from transformers import (
    AutoProcessor,
    Qwen2_5_VLForConditionalGeneration,
)
import numpy as np
from qwen_vl_utils import process_vision_info

VISION_MODEL_NAME = "qwen2_5_vit/onnx/vision_encoder.onnx"

class QwenVL_PartB(torch.nn.Module):
    def __init__(self, qwenvl):
        super(QwenVL_PartB, self).__init__()
        self.qwenvl = qwenvl

        self.num_heads = self.qwenvl.config.vision_config.num_heads
        self.head_dim = self.qwenvl.config.vision_config.hidden_size // self.num_heads
        self.head_dim_half = self.head_dim // 2
        self.variance_epsilon = float(1e-6)
        self.patch_size = self.qwenvl.visual.patch_size
        self.merge_size = self.qwenvl.visual.spatial_merge_size
        # CLIP normalization constants, folded so the raw 0-255 image can be
        # normalized with a single multiply and subtract.
        self.means = torch.tensor([0.48145466, 0.4578275, 0.40821073], dtype=torch.float32).view(1, 3, 1, 1)
        self.inv_std = torch.tensor([1.0 / 0.26862954, 1.0 / 0.26130258, 1.0 / 0.27577711], dtype=torch.float32).view(1, 3, 1, 1)
        self.means_inv_std = self.means * self.inv_std
        self.inv_255_std = self.inv_std / 255.0

        self.scale_factor = float(self.head_dim ** -0.25)

    def forward(self, pixel_values, grid_thw):
        self.width_factor = grid_thw[0, 2] // 2
        self.height_factor = grid_thw[0, 1] // 2
        self.factor_size = self.width_factor * self.height_factor * self.merge_size * self.merge_size

        # Precompute the rotary cos/sin tables for the vision tower from grid_thw.
        rotary_pos_emb = self.qwenvl.visual.rot_pos_emb(grid_thw).float().unsqueeze(0)
        cos = rotary_pos_emb.cos()
        sin = rotary_pos_emb.sin()
        self.rotary_pos_emb_cos = torch.cat([cos, cos], dim=-1).transpose(0, 1)
        self.rotary_pos_emb_sin = torch.cat([sin, sin], dim=-1).transpose(0, 1)

        # Build per-block attention masks from the window / full-attention sequence lengths.
        init_attention_mask = torch.ones([1, self.factor_size, self.factor_size], dtype=torch.int8)
        _, cu_window_seqlens = self.qwenvl.visual.get_window_index(grid_thw)
        cu_window_seqlens = torch.tensor(cu_window_seqlens, dtype=torch.int64)
        cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens)
        cu_seqlens = torch.tensor([0, self.factor_size], dtype=torch.int64)
        self.attention_mask = []
        for layer_num, blk in enumerate(self.qwenvl.visual.blocks):
            # Fold the attention scale into the q and k rows of the fused qkv projection.
            blk.attn.qkv.weight.data[:-self.qwenvl.visual.patch_embed.embed_dim] *= self.scale_factor
            blk.attn.qkv.bias.data[:-self.qwenvl.visual.patch_embed.embed_dim] *= self.scale_factor
            if layer_num in self.qwenvl.visual.fullatt_block_indexes:
                cu_seqlens_now = cu_seqlens
            else:
                cu_seqlens_now = cu_window_seqlens
            attention_mask = init_attention_mask
            for i in range(1, len(cu_seqlens_now)):
                attention_mask[..., cu_seqlens_now[i - 1]: cu_seqlens_now[i], cu_seqlens_now[i - 1]: cu_seqlens_now[i]] = 0
            self.attention_mask.append((attention_mask * -128.0).float())

        # Resize and normalize the image, duplicate it along the temporal axis,
        # then re-tile it into the patch layout expected by patch_embed.
        pixel_values = torch.nn.functional.interpolate(
            pixel_values.float(),
            [self.height_factor * 28, self.width_factor * 28],
            mode='bilinear',
            align_corners=True)
        pixel_values = pixel_values * self.inv_255_std - self.means_inv_std
        pixel_values = torch.cat([pixel_values, pixel_values], dim=0)
        pixel_values = pixel_values.reshape(
            self.qwenvl.visual.patch_embed.temporal_patch_size,
            3,
            self.height_factor,
            self.merge_size,
            self.patch_size,
            self.width_factor,
            self.merge_size,
            self.patch_size
        )
        pixel_values = pixel_values.permute(2, 5, 3, 6, 1, 0, 4, 7)
        pixel_values = pixel_values.reshape(
            self.factor_size,
            3,
            self.qwenvl.visual.patch_embed.temporal_patch_size,
            self.patch_size,
            self.patch_size
        )
        vision_hidden_states = self.qwenvl.visual.patch_embed.proj(pixel_values.to(self.qwenvl.device)).view(1, -1, self.qwenvl.visual.patch_embed.embed_dim)
        return vision_hidden_states

model_path = "/ofs/icvs/lijian/Models/Qwen2.5-VL-7B-Instruct"
#model_path = "/ofs/icvs/lijian/Models/Qwen2.5-VL-3B-Instruct"
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_path, device_map="auto", attn_implementation="eager").eval()  # eager
url = '147.png'
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": url},
            {"type": "text", "text": "Describe this image."},
        ],
    },
]

images, video_inputs = process_vision_info(conversation)

image = images[0]
grid_thw = torch.tensor([[1, image.height // 14, image.width // 14]], dtype=torch.int64)
pixel_values = np.transpose(np.array(image).astype(np.float32), (2, 0, 1))
pixel_values = torch.from_numpy(np.expand_dims(pixel_values, axis=0))

model_B = QwenVL_PartB(model)#.to(model.device)
vision_hidden_states = model_B(pixel_values,grid_thw)
vision_hidden_states.cpu().detach().numpy().tofile('147_pt.raw')

torch.onnx.export(
    model_B,
    (pixel_values, grid_thw),
    f=VISION_MODEL_NAME,
    export_params=True,
    opset_version=14,
    do_constant_folding=True,
    input_names=['pixel_values', "grid_thw"],
    output_names=['vision_hidden_states'],
    # external_data_format=True,
    dynamic_axes={
        "pixel_values": {
            0: "batch_size",
            2: "height",
            3: "width",
        },
        "grid_thw": {0: "batch_size"},
        "vision_hidden_states": {0: "batch_size", 1: "4 * WIDTH_FACTOR * HEIGHT_FACTOR"},
    },
)

import onnx,onnxruntime

ort_session_B = onnxruntime.InferenceSession(VISION_MODEL_NAME, providers=['CPUExecutionProvider'])
in_name_B = ort_session_B.get_inputs()
out_name_B = ort_session_B.get_outputs()
in_name_B0 = in_name_B[0].name
in_name_B1 = in_name_B[1].name
out_name_B0 = out_name_B[0].name
vision_hidden_states = ort_session_B.run_with_ort_values(
    [out_name_B0],
    {
        in_name_B0: onnxruntime.OrtValue.ortvalue_from_numpy(pixel_values.numpy(), 'cpu', 0),
        in_name_B1: onnxruntime.OrtValue.ortvalue_from_numpy(grid_thw.cpu().detach().numpy(), 'cpu', 1),
    },
)[0]
onnxruntime.OrtValue.numpy(vision_hidden_states)[0].tofile('147_onnx.raw')  # (624, 1280)

data_pt = np.fromfile('147_pt.raw')
data_onnx = np.fromfile('147_onnx.raw')

print(data_pt)
print(data_onnx,)
print("最大误差:", np.max(np.abs(data_onnx - data_pt)))
#########下面是结果
[-3.05445570e-17 -1.03013384e-14 -1.04393472e-20 ... 5.01648082e-06
-1.21081098e-01 -3.39158247e-29]
[-2.99890092e-17 -1.04578091e-14 -1.01721291e-20 ... 5.03944010e-06
-1.21179565e-01 -6.27045904e-28]
max error: 78.24999971955549
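
Note on the comparison itself: .tofile() writes the tensors in their original dtype (float32 here), while np.fromfile() without a dtype argument reads the bytes back as float64, so the printed arrays and the error above are computed on reinterpreted data. A minimal sketch of a dtype-consistent readback, assuming both dumps really are float32:

import numpy as np

# Sketch only: read the dumps back with the dtype they were written in.
data_pt = np.fromfile('147_pt.raw', dtype=np.float32)
data_onnx = np.fromfile('147_onnx.raw', dtype=np.float32)
print("max error:", np.max(np.abs(data_onnx - data_pt)))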
