TensorRT model always return NaN output #3952

@OctaAIVision

Description

I'm trying to run SuperGlue with TensorRT; the model has dynamic-shape outputs. Inference runs without any error, but the result is always NaN. When I size the output buffer larger than the expected shape, I get NaN for the expected region and 0.0 for the rest.
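
For context, once every input shape is set on the execution context, the resolved output shape can be read back and checked against the allocated buffer; a minimal sketch, assuming the TensorRT >= 8.5 tensor API and an existing engine and context:

for i in range(engine.num_io_tensors):
    name = engine.get_tensor_name(i)
    if engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT:
        print(name, context.get_tensor_shape(name))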

Environment

TensorRT Version: 8.6.2.3
GPU Type: NVIDIA Jetson Orin NX (16 GB RAM)
Nvidia Driver Version: JetPack 6.0 DP
CUDA Version: 12.2.140
CUDNN Version: 8.9.4.25
Operating System + Version: Ubuntu 22.04 LTS
Python Version (if applicable): 3.10.12

Relevant Files

Superglue pytorch implementation: https://github.com/magicleap/SuperGluePretrainedNetwork
convert_to_onnx: https://github.com/yuefanhao/SuperPoint-SuperGlue-TensorRT/blob/main/convert2onnx/convert_superglue_to_onnx.py
build_engine.py:

import tensorrt as trt

# Initialize TensorRT logger and builder
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
builder = trt.Builder(TRT_LOGGER)
config = builder.create_builder_config()


# Set cache
cache = config.create_timing_cache(b"")
config.set_timing_cache(cache, ignore_mismatch=False)


flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
network = builder.create_network(flag)
parser = trt.OnnxParser(network, TRT_LOGGER)

path_onnx_model = "/home/jetson/a/TensorRT/trt/superglue_outdoor_sim_int32.onnx"

with open(path_onnx_model, "rb") as f:
    if not parser.parse(f.read()):
        print(f"ERROR: Failed to parse the ONNX file {path_onnx_model}")
        for error in range(parser.num_errors):
            print(parser.get_error(error))
        raise RuntimeError("ONNX parse failed; aborting engine build")


inputs = [network.get_input(i) for i in range(network.num_inputs)]
outputs = [network.get_output(i) for i in range(network.num_outputs)]
print(outputs[0])

profile = builder.create_optimization_profile()

# One (min, opt, max) range per network input, in input order:
# keypoints_0, scores_0, descriptors_0, keypoints_1, scores_1, descriptors_1.
dynamic_ranges = [
    ([1, 1, 2],   [1, 1024, 2],   [1, 2048, 2]),
    ([1, 1],      [1, 1024],      [1, 2048]),
    ([1, 256, 1], [1, 256, 1024], [1, 256, 2048]),
    ([1, 1, 2],   [1, 1024, 2],   [1, 2048, 2]),
    ([1, 1],      [1, 1024],      [1, 2048]),
    ([1, 256, 1], [1, 256, 1024], [1, 256, 2048]),
]
for inp, (min_shape, opt_shape, max_shape) in zip(inputs, dynamic_ranges):
    profile.set_shape(inp.name, min_shape, opt_shape, max_shape)


config.add_optimization_profile(profile)


config.get_calibration_profile()  # no-op as written: the returned profile is unused (calibration only applies to INT8)


# Check if fast FP16 is available
# print(builder.platform_has_fast_fp16)


config.set_flag(trt.BuilderFlag.FP16)

# Build engine
engine_bytes = builder.build_serialized_network(network, config)

engine_path = "superglue.engine"
with open(engine_path, "wb") as f:
    f.write(engine_bytes)
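
One thing worth ruling out (an assumption on my part, not verified for this model): SuperGlue's attention and log-softmax layers can overflow in half precision, which would produce NaNs only in the TensorRT engine. Rebuilding once without the FP16 flag gives an FP32 engine to compare against:

# config.set_flag(trt.BuilderFlag.FP16)  # leave unset to build a pure FP32 engine
engine_bytes = builder.build_serialized_network(network, config)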

inference.py:

import numpy as np
import tensorrt as trt
from cuda import cuda, cudart
import ctypes
from typing import Optional, List
import torch




def check_cuda_err(err):
    if isinstance(err, cuda.CUresult):
        if err != cuda.CUresult.CUDA_SUCCESS:
            raise RuntimeError("Cuda Error: {}".format(err))
    elif isinstance(err, cudart.cudaError_t):
        if err != cudart.cudaError_t.cudaSuccess:
            raise RuntimeError("Cuda Runtime Error: {}".format(err))
    else:
        raise RuntimeError("Unknown error type: {}".format(err))

def cuda_call(call):
    err, res = call[0], call[1:]
    check_cuda_err(err)
    if len(res) == 1:
        res = res[0]
    return res
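
# Example: cuda-python calls return (err, *results) tuples, so cuda_call unwraps
#     device_ptr = cuda_call(cudart.cudaMalloc(1024))
# down to the bare device pointer, raising on any non-success status.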



class HostDeviceMem:
    """Pair of host and device memory, where the host memory is wrapped in a numpy array"""
    def __init__(self, size: int, dtype: np.dtype):
        nbytes = size * dtype.itemsize
        host_mem = cuda_call(cudart.cudaMallocHost(nbytes))
        pointer_type = ctypes.POINTER(np.ctypeslib.as_ctypes_type(dtype))

        self._host = np.ctypeslib.as_array(ctypes.cast(host_mem, pointer_type), (size,))
        self._device = cuda_call(cudart.cudaMalloc(nbytes))
        self._nbytes = nbytes

    @property
    def host(self) -> np.ndarray:
        return self._host

    @host.setter
    def host(self, arr: np.ndarray):
        if arr.size > self.host.size:
            raise ValueError(
                f"Tried to fit an array of size {arr.size} into host memory of size {self.host.size}"
            )
        np.copyto(self.host[:arr.size], arr.flat)

    @property
    def device(self) -> int:
        return self._device

    @property
    def nbytes(self) -> int:
        return self._nbytes

    def __str__(self):
        return f"Host:\n{self.host}\nDevice:\n{self.device}\nSize:\n{self.nbytes}\n"

    def __repr__(self):
        return self.__str__()

    def free(self):
        cuda_call(cudart.cudaFree(self.device))
        cuda_call(cudart.cudaFreeHost(self.host.ctypes.data))
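
# Usage sketch: mem = HostDeviceMem(trt.volume(shape), np.dtype(np.float32));
# fill mem.host, copy to mem.device for inference, and call mem.free() when done.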


# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
# For this dynamic-shape engine the caller supplies the concrete shape of every
# I/O tensor via inputs_shape, in binding order.
def allocate_buffers(engine: trt.ICudaEngine, inputs_shape):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda_call(cudart.cudaStreamCreate())
    tensor_names = [engine.get_tensor_name(i) for i in range(engine.num_io_tensors)]
    print("Tensor Names:", tensor_names)
    for binding in tensor_names:
        # get_tensor_shape accepts a tensor name (TensorRT >= 8.5 API); dynamic
        # dimensions still show up as -1 at engine level.
        shape = engine.get_tensor_shape(binding)
        if engine.get_tensor_mode(binding) == trt.TensorIOMode.INPUT:
            print("Input shape for tensor", binding, ":", shape)
        else:
            print("Output shape for tensor", binding, ":", shape)
    for shape, binding in zip(inputs_shape, tensor_names):
        size = trt.volume(shape)
        # Get the tensor data type declared by the engine
        dtype = np.dtype(trt.nptype(engine.get_tensor_dtype(binding)))
        print(dtype)

        # Allocate host and device buffers
        bindingMemory = HostDeviceMem(size, dtype)

        # Append the device buffer to device bindings.
        bindings.append(int(bindingMemory.device))

        # Append to the appropriate list.
        if engine.get_tensor_mode(binding) == trt.TensorIOMode.INPUT:
            inputs.append(bindingMemory)
        else:
            print("we are here: output binding", binding, dtype)
            outputs.append(bindingMemory)

    return inputs, outputs, bindings, stream
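
# Note: execute_async_v2 expects the bindings list ordered by binding index;
# tensor_names comes from engine.get_tensor_name(i) in index order, so
# appending device pointers in the loop above preserves that ordering.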


# Frees the resources allocated in allocate_buffers
def free_buffers(inputs: List[HostDeviceMem], outputs: List[HostDeviceMem], stream: cudart.cudaStream_t):
    for mem in inputs + outputs:
        mem.free()
    cuda_call(cudart.cudaStreamDestroy(stream))


# Wrapper for cudaMemcpy which infers copy size and does error checking
def memcpy_host_to_device(device_ptr: int, host_arr: np.ndarray):
    nbytes = host_arr.size * host_arr.itemsize
    cuda_call(cudart.cudaMemcpy(device_ptr, host_arr, nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice))


# Wrapper for cudaMemcpy which infers copy size and does error checking
def memcpy_device_to_host(host_arr: np.ndarray, device_ptr: int):
    nbytes = host_arr.size * host_arr.itemsize
    cuda_call(cudart.cudaMemcpy(host_arr, device_ptr, nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost))


def _do_inference_base(inputs, outputs, stream, execute_async):
    # Transfer input data to the GPU.
    kind = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
    [cuda_call(cudart.cudaMemcpyAsync(inp.device, inp.host, inp.nbytes, kind, stream)) for inp in inputs]
    # Run inference.
    execute_async()
    # Transfer predictions back from the GPU.
    kind = cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
    [cuda_call(cudart.cudaMemcpyAsync(out.host, out.device, out.nbytes, kind, stream)) for out in outputs]
    # Synchronize the stream
    cuda_call(cudart.cudaStreamSynchronize(stream))
    # Return only the host outputs.
    return [out.host for out in outputs]


# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    def execute_async():
        context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream)
    return _do_inference_base(inputs, outputs, stream, execute_async)
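
# Note: execute_async() above is the implicit-batch API; this engine was built
# with EXPLICIT_BATCH, so only do_inference_v2 below is expected to work here.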


# This function is generalized for multiple inputs/outputs for full dimension networks.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference_v2(context, bindings, inputs, outputs, stream):
    def execute_async():
        context.execute_async_v2(bindings=bindings, stream_handle=stream)
    return _do_inference_base(inputs, outputs, stream, execute_async)
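
# Sketch (assumption: TensorRT >= 8.5): the name-based tensor API avoids binding
# indices entirely and is the non-deprecated path:
#     for mem, name in zip(inputs + outputs, tensor_names):
#         context.set_tensor_address(name, mem.device)
#     context.execute_async_v3(stream_handle=stream)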

    
# Placeholder random inputs (immediately replaced by the real tensors loaded below).
kpts0 = np.random.randint(0, 255, (1, 14, 2)) / 255
scores0 = np.random.randint(0, 255, (1, 1293)) / 255
desc0 = np.random.randint(0, 255, (1, 256, 1293)) / 255
kpts1 = np.random.randint(0, 255, (1, 1246, 2)) / 255
scores1 = np.random.randint(0, 255, (1, 1246)) / 255
desc1 = np.random.randint(0, 255, (1, 256, 1246)) / 255


# Real inputs: the same SuperPoint tensors are reused for image 0 and image 1 in this repro.
kpts0 = torch.load('/home/jetson/Downloads/keypoints.pt', map_location=torch.device('cpu'))
kpts0 = kpts0.numpy()

scores0 = torch.load('/home/jetson/Downloads/keypoint_scores.pt', map_location = torch.device('cpu'))
scores0 = scores0.numpy()

desc0 = torch.load('/home/jetson/Downloads/descriptors.pt', map_location = torch.device('cpu'))
desc0 = desc0.numpy()
desc0 = desc0.transpose(0, 2, 1)



kpts1 = torch.load('/home/jetson/Downloads/keypoints.pt', map_location = torch.device('cpu'))
kpts1 = kpts1.numpy()

scores1 = torch.load('/home/jetson/Downloads/keypoint_scores.pt', map_location = torch.device('cpu'))
scores1 = scores1.numpy()

desc1 = torch.load('/home/jetson/Downloads/descriptors.pt', map_location = torch.device('cpu'))
desc1 = desc1.numpy()
desc1 = desc1.transpose(0, 2, 1)



# Function to load a TensorRT engine from a file
def load_engine(engine_file_path):
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        return runtime.deserialize_cuda_engine(f.read())

# Load the engine
engine_file_path = "/home/jetson/a/TensorRT/superglue.engine"
engine = load_engine(engine_file_path)



context = engine.create_execution_context()
# Set the concrete shape of every input for this run (the binding-index API is
# deprecated but still functional on TensorRT 8.6).
input_arrays = {
    "keypoints_0": kpts0, "scores_0": scores0, "descriptors_0": desc0,
    "keypoints_1": kpts1, "scores_1": scores1, "descriptors_1": desc1,
}
for name, arr in input_arrays.items():
    context.set_binding_shape(engine.get_binding_index(name), arr.shape)
# The final (1, 100, 100) entry is a guessed shape for the output buffer.
inputs_shape = [kpts0.shape, scores0.shape, desc0.shape, kpts1.shape, scores1.shape, desc1.shape, (1, 100, 100)]
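# Sketch (assumption: a single output tensor, TensorRT >= 8.5 API): with all the
# input shapes set above, the resolved output shape could replace the guess:
#     out_name = engine.get_tensor_name(engine.num_io_tensors - 1)
#     print("Resolved output shape:", context.get_tensor_shape(out_name))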
# Allocate memory for inputs and outputs
print("before allocate")
inputs, outputs, bindings, stream = allocate_buffers(engine, inputs_shape)
print('outputs: ', outputs)

print("after allocate")




output_data = do_inference_v2(context, bindings, inputs, outputs, stream)

# Process the output (example)
print("Output:", output_data[0])


# Free allocated memory
free_buffers(inputs, outputs, stream)
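
For reference, the same arrays can be pushed through the original ONNX file with onnxruntime to confirm whether the NaNs come from the TensorRT engine or from the exported model itself. A sketch, assuming onnxruntime is installed and that sess.get_inputs() returns the six inputs in the same order used above:

import onnxruntime as ort

sess = ort.InferenceSession("/home/jetson/a/TensorRT/trt/superglue_outdoor_sim_int32.onnx")
feeds = {}
for meta, arr in zip(sess.get_inputs(), [kpts0, scores0, desc0, kpts1, scores1, desc1]):
    # Cast to whatever dtype the export declares (the _int32 export may expect ints).
    dtype = np.float32 if "float" in meta.type else np.int32
    feeds[meta.name] = arr.astype(dtype)
ref = sess.run(None, feeds)
print("ONNX reference contains NaN:", any(np.isnan(r.astype(np.float64)).any() for r in ref))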
