
Qwen2.5-VL-3B results do not match the ONNX-converted model #40

@jianbaba

Description


Hi, one more question. Background: I only exported the part of onnx_b that comes before the ViT to ONNX, and switched it to dynamic inputs, but the exported ONNX results do not match the results of model_B. The code is below; could you take a look?

import sys
import os
sys.path.append(os.path.abspath("./qwen-vl-utils/src"))
import requests
from PIL import Image
import torch
from transformers import (
    AutoProcessor,
    Qwen2_5_VLForConditionalGeneration,
)
import numpy as np
from qwen_vl_utils import process_vision_info

VISION_MODEL_NAME = "qwen2_5_vit/onnx/vision_encoder.onnx"

class QwenVL_PartB(torch.nn.Module):
    def __init__(self, qwenvl):
        super(QwenVL_PartB, self).__init__()
        self.qwenvl = qwenvl

        self.num_heads = self.qwenvl.config.vision_config.num_heads
        self.head_dim = self.qwenvl.config.vision_config.hidden_size // self.num_heads
        self.head_dim_half = self.head_dim // 2
        self.variance_epsilon = float(1e-6)
        self.patch_size = self.qwenvl.visual.patch_size
        self.merge_size = self.qwenvl.visual.spatial_merge_size
        # CLIP normalization constants, folded so the raw 0-255 image can be
        # normalized with a single multiply and subtract.
        self.means = torch.tensor([0.48145466, 0.4578275, 0.40821073], dtype=torch.float32).view(1, 3, 1, 1)
        self.inv_std = torch.tensor([1.0 / 0.26862954, 1.0 / 0.26130258, 1.0 / 0.27577711], dtype=torch.float32).view(1, 3, 1, 1)
        self.means_inv_std = self.means * self.inv_std
        self.inv_255_std = self.inv_std / 255.0

        self.scale_factor = float(self.head_dim ** -0.25)

    def forward(self, pixel_values, grid_thw):
        self.width_factor = grid_thw[0, 2] // 2
        self.height_factor = grid_thw[0, 1] // 2
        self.factor_size = self.width_factor * self.height_factor * self.merge_size * self.merge_size

        # Precompute the rotary cos/sin tables for the vision tower from grid_thw.
        rotary_pos_emb = self.qwenvl.visual.rot_pos_emb(grid_thw).float().unsqueeze(0)
        cos = rotary_pos_emb.cos()
        sin = rotary_pos_emb.sin()
        self.rotary_pos_emb_cos = torch.cat([cos, cos], dim=-1).transpose(0, 1)
        self.rotary_pos_emb_sin = torch.cat([sin, sin], dim=-1).transpose(0, 1)

        # Build per-block attention masks from the window / full-attention sequence lengths.
        init_attention_mask = torch.ones([1, self.factor_size, self.factor_size], dtype=torch.int8)
        _, cu_window_seqlens = self.qwenvl.visual.get_window_index(grid_thw)
        cu_window_seqlens = torch.tensor(cu_window_seqlens, dtype=torch.int64)
        cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens)
        cu_seqlens = torch.tensor([0, self.factor_size], dtype=torch.int64)
        self.attention_mask = []
        for layer_num, blk in enumerate(self.qwenvl.visual.blocks):
            # Fold the attention scale into the q and k rows of the fused qkv projection.
            blk.attn.qkv.weight.data[:-self.qwenvl.visual.patch_embed.embed_dim] *= self.scale_factor
            blk.attn.qkv.bias.data[:-self.qwenvl.visual.patch_embed.embed_dim] *= self.scale_factor
            if layer_num in self.qwenvl.visual.fullatt_block_indexes:
                cu_seqlens_now = cu_seqlens
            else:
                cu_seqlens_now = cu_window_seqlens
            attention_mask = init_attention_mask
            for i in range(1, len(cu_seqlens_now)):
                attention_mask[..., cu_seqlens_now[i - 1]: cu_seqlens_now[i], cu_seqlens_now[i - 1]: cu_seqlens_now[i]] = 0
            self.attention_mask.append((attention_mask * -128.0).float())

        # Resize and normalize the image, duplicate it along the temporal axis,
        # then re-tile it into the patch layout expected by patch_embed.
        pixel_values = torch.nn.functional.interpolate(
            pixel_values.float(),
            [self.height_factor * 28, self.width_factor * 28],
            mode='bilinear',
            align_corners=True)
        pixel_values = pixel_values * self.inv_255_std - self.means_inv_std
        pixel_values = torch.cat([pixel_values, pixel_values], dim=0)
        pixel_values = pixel_values.reshape(
            self.qwenvl.visual.patch_embed.temporal_patch_size,
            3,
            self.height_factor,
            self.merge_size,
            self.patch_size,
            self.width_factor,
            self.merge_size,
            self.patch_size
        )
        pixel_values = pixel_values.permute(2, 5, 3, 6, 1, 0, 4, 7)
        pixel_values = pixel_values.reshape(
            self.factor_size,
            3,
            self.qwenvl.visual.patch_embed.temporal_patch_size,
            self.patch_size,
            self.patch_size
        )
        vision_hidden_states = self.qwenvl.visual.patch_embed.proj(pixel_values.to(self.qwenvl.device)).view(1, -1, self.qwenvl.visual.patch_embed.embed_dim)
        return vision_hidden_states

model_path = "/ofs/icvs/lijian/Models/Qwen2.5-VL-7B-Instruct"
#model_path = "/ofs/icvs/lijian/Models/Qwen2.5-VL-3B-Instruct"
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_path, device_map="auto", attn_implementation="eager").eval()  # eager
url = '147.png'
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": url},
            {"type": "text", "text": "Describe this image."},
        ],
    },
]

images, video_inputs = process_vision_info(conversation)

image = images[0]
grid_thw = torch.tensor([[1, image.height // 14, image.width // 14]], dtype=torch.int64)
pixel_values = np.transpose(np.array(image).astype(np.float32), (2, 0, 1))
pixel_values = torch.from_numpy(np.expand_dims(pixel_values, axis=0))

model_B = QwenVL_PartB(model)#.to(model.device)
vision_hidden_states = model_B(pixel_values,grid_thw)
vision_hidden_states.cpu().detach().numpy().tofile('147_pt.raw')

torch.onnx.export(
    model_B,
    (pixel_values, grid_thw),
    f=VISION_MODEL_NAME,
    export_params=True,
    opset_version=14,
    do_constant_folding=True,
    input_names=['pixel_values', "grid_thw"],
    output_names=['vision_hidden_states'],
    # external_data_format=True,
    dynamic_axes={
        "pixel_values": {
            0: "batch_size",
            2: "height",
            3: "width",
        },
        "grid_thw": {0: "batch_size"},
        "vision_hidden_states": {0: "batch_size", 1: "4 * WIDTH_FACTOR * HEIGHT_FACTOR"},
    },
)

import onnx,onnxruntime

ort_session_B = onnxruntime.InferenceSession(VISION_MODEL_NAME, providers=['CPUExecutionProvider'])
in_name_B = ort_session_B.get_inputs()
out_name_B = ort_session_B.get_outputs()
in_name_B0 = in_name_B[0].name
in_name_B1 = in_name_B[1].name
out_name_B0 = out_name_B[0].name
vision_hidden_states = ort_session_B.run_with_ort_values(
    [out_name_B0],
    {
        in_name_B0: onnxruntime.OrtValue.ortvalue_from_numpy(pixel_values.numpy(), 'cpu', 0),
        in_name_B1: onnxruntime.OrtValue.ortvalue_from_numpy(grid_thw.cpu().detach().numpy(), 'cpu', 1),
    },
)[0]
onnxruntime.OrtValue.numpy(vision_hidden_states)[0].tofile('147_onnx.raw')  # (624, 1280)

data_pt = np.fromfile('147_pt.raw')
data_onnx = np.fromfile('147_onnx.raw')

print(data_pt)
print(data_onnx,)
print("最大误差:", np.max(np.abs(data_onnx - data_pt)))
#########下面是结果
[-3.05445570e-17 -1.03013384e-14 -1.04393472e-20 ... 5.01648082e-06
-1.21081098e-01 -3.39158247e-29]
[-2.99890092e-17 -1.04578091e-14 -1.01721291e-20 ... 5.03944010e-06
-1.21179565e-01 -6.27045904e-28]
max error: 78.24999971955549
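
Note on the comparison itself: .tofile() writes the tensors in their original dtype (float32 here), while np.fromfile() without a dtype argument reads the bytes back as float64, so the printed arrays and the error above are computed on reinterpreted data. A minimal sketch of a dtype-consistent readback, assuming both dumps really are float32:

import numpy as np

# Sketch only: read the dumps back with the dtype they were written in.
data_pt = np.fromfile('147_pt.raw', dtype=np.float32)
data_onnx = np.fromfile('147_onnx.raw', dtype=np.float32)
print("max error:", np.max(np.abs(data_onnx - data_pt)))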
