
TensorRT produces wrong results for an ONNX model with only the Mean operator on GPU RTX 3080 #4509

@coffezhou

Description


For the following simple ONNX model (graph image omitted; per the title, the graph contains only a single Mean operator), the results produced by onnxruntime are as follows:

[array([[[[-0.625  , -0.597  ,  0.139  , ..., -0.384  , -0.2969 ,
          -2.113  ],
         [-1.257  ,  0.1921 , -0.4863 , ...,  0.921  ,  1.142  ,
          -1.792  ],
         [ 1.127  , -0.843  ,  1.876  , ..., -0.4797 ,  0.0273 ,
          -0.5825 ],
         ...,
         [ 0.5156 ,  0.388  ,  0.355  , ...,  1.508  ,  1.266  ,
          -1.848  ],
         [-0.273  , -0.03482,  2.162  , ..., -0.1318 , -0.403  ,
          -0.412  ],
         [-1.791  , -0.2048 , -1.652  , ...,  0.67   , -0.2468 ,
          -1.362  ]],

        [[-0.4944 ,  0.7485 , -0.1753 , ..., -1.611  ,  0.0641 ,
           0.807  ],
         [ 0.529  , -0.7485 , -0.256  , ...,  0.755  ,  0.8496 ,
           0.5527 ],
         [ 1.542  ,  1.063  ,  1.591  , ...,  0.8154 ,  0.4368 ,
           0.502  ],
         ...,
         [-1.313  , -0.1088 ,  1.958  , ...,  0.7207 , -0.0293 ,
           0.2233 ],
         [-0.521  ,  1.016  , -1.759  , ...,  0.117  , -2.307  ,
           0.0855 ],
         [ 1.646  ,  0.3445 ,  1.83   , ..., -0.05853,  1.348  ,
           0.1423 ]],

        [[-0.4802 , -0.4946 , -0.06354, ...,  0.728  ,  0.7437 ,
           0.407  ],
         [-0.6484 , -1.324  , -1.111  , ...,  1.911  ,  0.1493 ,
          -0.3125 ],
         [-0.495  , -0.588  , -1.793  , ...,  0.9346 , -0.6177 ,
          -2.043  ],
         ...,
         [ 0.3616 , -1.132  ,  0.4253 , ...,  0.3723 , -2.402  ,
          -1.751  ],
         [-0.0389 ,  1.895  , -0.00837, ...,  0.7773 , -2.025  ,
          -1.092  ],
         [ 0.3916 ,  0.3914 , -0.2249 , ...,  0.525  , -0.5117 ,
           1.433  ]]]], shape=(1, 3, 32, 32), dtype=float16)]

However, when I run it with TensorRT, the results are as follows:

[array([[[[-0., -0.,  0., ..., -0., -0., -0.],
         [-0.,  0., -0., ...,  0.,  0., -0.],
         [ 0., -0.,  0., ..., -0.,  0., -0.],
         ...,
         [ 0.,  0.,  0., ...,  0.,  0., -0.],
         [-0., -0.,  0., ..., -0., -0., -0.],
         [-0., -0., -0., ...,  0., -0., -0.]],

        [[-0.,  0., -0., ..., -0.,  0.,  0.],
         [ 0., -0., -0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         ...,
         [-0., -0.,  0., ...,  0., -0.,  0.],
         [-0.,  0., -0., ...,  0., -0.,  0.],
         [ 0.,  0.,  0., ..., -0.,  0.,  0.]],

        [[-0., -0., -0., ...,  0.,  0.,  0.],
         [-0., -0., -0., ...,  0.,  0., -0.],
         [-0., -0., -0., ...,  0., -0., -0.],
         ...,
         [ 0., -0.,  0., ...,  0., -0., -0.],
         [-0.,  0., -0., ...,  0., -0., -0.],
         [ 0.,  0., -0., ...,  0., -0.,  0.]]]],
      shape=(1, 3, 32, 32), dtype=float16)]

This is very strange. The input data type is float16. Does my GPU RTX 3080 not support float16?
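For what it's worth, I believe the RTX 3080 (Ampere, compute capability 8.6) does support FP16 natively. A quick sanity check with pycuda, which the reproduction script below already depends on:

import pycuda.driver as cuda
import pycuda.autoinit

# Compute capability of device 0; an RTX 3080 should report (8, 6).
print(cuda.Device(0).compute_capability())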

Environment

TensorRT Version: 10.12.0.36

NVIDIA GPU: GeForce RTX 3080

NVIDIA Driver Version: 535.183.01

CUDA Version: 12.2

CUDNN Version: none

Operating System: ubuntu 20.04

Python Version (if applicable): 3.12.9

Relevant Files

Model link:
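
The model link above is empty; in case the attachment becomes unavailable, here is a minimal sketch that builds a structurally similar model. The tensor names, the single-input Mean, and the default opset are assumptions of mine, not taken from the attached file:

import onnx
from onnx import TensorProto, helper

# Hypothetical reconstruction: one Mean node over a single float16 input
# (Mean is variadic in ONNX; with one input it is effectively an identity).
x = helper.make_tensor_value_info("x", TensorProto.FLOAT16, [1, 3, 32, 32])
y = helper.make_tensor_value_info("y", TensorProto.FLOAT16, [1, 3, 32, 32])
mean = helper.make_node("Mean", inputs=["x"], outputs=["y"])
graph = helper.make_graph([mean], "mean_only", [x], [y])
model = helper.make_model(graph)
onnx.checker.check_model(model)
onnx.save(model, "11_reconstructed.onnx")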

Steps To Reproduce

This issue can be reproduced by running the following code with the model in the attachment (testcase.zip).

import sys
import pickle

import numpy as np
import onnx
import onnxruntime

import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # creates a CUDA context as a side effect

def test():
    onnx_model = onnx.load('11.onnx')

    with open("inputs.pkl", "rb") as fp:
        inputs = pickle.load(fp)

    # Reference run: onnxruntime on the CPU execution provider.
    try:
        ort_session = onnxruntime.InferenceSession(
            onnx_model.SerializeToString(), providers=["CPUExecutionProvider"]
        )
        ort_output = ort_session.run(None, inputs)  # None fetches all outputs
    except Exception as e:
        print(e)
        print("This model cannot be executed by onnxruntime!")
        sys.exit(1)

    print("ONNXRuntime:\n", ort_output)
    
    #--------------------------------------------------------

    # Build a TensorRT engine from the same ONNX file.
    trt_logger = trt.Logger(trt.Logger.WARNING)
    trt.init_libnvinfer_plugins(trt_logger, '')
    builder = trt.Builder(trt_logger)
    network = builder.create_network(flags=1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))

    parser = trt.OnnxParser(network, trt_logger)
    with open('11.onnx', 'rb') as model_file:
        if not parser.parse(model_file.read()):
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            sys.exit(1)

    config = builder.create_builder_config()
    serialized_engine = builder.build_serialized_network(network, config)

    if serialized_engine is None:
        sys.exit(1)

    with open("engine.trt", "wb") as f:
        f.write(serialized_engine)
        
    with open("engine.trt", "rb") as f, trt.Runtime(trt_logger) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
        
    context = engine.create_execution_context()

    inputs_trt, outputs_trt, bindings = [], [], []
    stream = cuda.Stream()
    input_name = []
    output_shape_dtype = []
    #------------------------------------------------------------
    for binding in engine:
        size = trt.volume(engine.get_tensor_shape(binding))
        dtype = trt.nptype(engine.get_tensor_dtype(binding))
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append({'name':binding, 'address':int(device_mem)})
        
        if engine.get_tensor_mode(binding) == trt.TensorIOMode.INPUT:
            inputs_trt.append({'host': host_mem, 'device': device_mem})
            input_name.append(binding)
        else:
            outputs_trt.append({'host': host_mem, 'device': device_mem})
            output_shape = engine.get_tensor_shape(binding)
            output_shape_dtype.append({'shape':output_shape, 'dtype':dtype})

    # Copy inputs host -> device.
    for i, input_mem in enumerate(inputs_trt):
        inp = np.ravel(inputs[input_name[i]])
        np.copyto(input_mem['host'], inp)
        cuda.memcpy_htod_async(input_mem['device'], input_mem['host'], stream)

    # Bind every tensor name to its device address.
    for bind in bindings:
        context.set_tensor_address(bind['name'], bind['address'])

    context.execute_async_v3(stream_handle=stream.handle)

    # Queue device -> host copies; the reshapes below only create views, so
    # the data is not actually read until after the synchronize.
    trt_output = []
    for i, output_mem in enumerate(outputs_trt):
        cuda.memcpy_dtoh_async(output_mem['host'], output_mem['device'], stream)
        out_shape = output_shape_dtype[i]['shape']
        trt_output.append(output_mem['host'].reshape(out_shape))

    stream.synchronize()

    print("TensorRT:\n", trt_output)

    assert len(ort_output) == len(trt_output), "Unequal number of outputs"

    np.testing.assert_allclose(trt_output[0], ort_output[0], rtol=0.1, atol=0.1)
        

    
if __name__ == "__main__":
    test()
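
As an additional data point, the same model can be run on the GPU through onnxruntime's CUDA execution provider (this assumes the onnxruntime-gpu package is installed); if its output matches the CPU result, the problem is specific to TensorRT rather than to FP16 on this GPU:

import pickle

import onnxruntime

with open("inputs.pkl", "rb") as fp:
    inputs = pickle.load(fp)

# Same model and inputs as above, but on the GPU via the CUDA execution provider.
sess = onnxruntime.InferenceSession("11.onnx", providers=["CUDAExecutionProvider"])
print(sess.run(None, inputs))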
    

testcase.zip

Commands or scripts:

Have you tried the latest release?: yes

Can this model run on other frameworks? For example run ONNX model with ONNXRuntime (polygraphy run <model.onnx> --onnxrt): the model can be executed by onnxruntime.
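
For a one-shot comparison of the two backends on this model, polygraphy can drive both at once (assuming a polygraphy install with TensorRT support):

polygraphy run 11.onnx --trt --onnxrt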

Labels

Module:Accuracy (output mismatch between TensorRT and other frameworks)
Module:ONNX (issues relating to ONNX usage and import)
