-
Notifications
You must be signed in to change notification settings - Fork 2.3k
Description
Description
I am encountering an issue when running inference using a TensorRT engine. Specifically, if I include `.to("cuda")` when preparing my PyTorch input tensors, inference fails and the outputs are filled with zeros. Here is the error message:
[TRT] [E] IExecutionContext::executeV2: Error Code 1: Cask (Cask convolution execution)
tensor([[[[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]],
[[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]]]])
If I remove to("cuda") during input tensor preparation, the issue does not occur, and the inference works as expected. I suspect this is related to how tensors are being transferred to the GPU.
Environment
TensorRT Version: 10.3
NVIDIA GPU:
NVIDIA Driver Version:
CUDA Version: 12.6.68
CUDNN Version: 9.3.0.75
Operating System:
Python Version (if applicable): 3.10
Tensorflow Version (if applicable):
PyTorch Version (if applicable):
Baremetal or Container (if so, version):
Relevant Files
import time

import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit  # noqa: F401 -- side effect: creates a CUDA context
import tensorrt as trt
import torch
class TensorRTInferencer:
    """Thin wrapper around a serialized TensorRT engine.

    Owns the engine, one execution context, and (after the first call)
    pinned-host/device buffers allocated through PyCUDA.
    """

    def __init__(self, engine_path, device):
        """Initialize the TensorRT inferencer.

        Args:
            engine_path: Path to the serialized engine file on disk.
            device: Device identifier (e.g. "cuda:0"); stored but not
                otherwise used by this class.
        """
        # self.logger = trt.Logger(trt.Logger.WARNING)
        self.logger = trt.Logger(trt.Logger.VERBOSE)
        self.engine = self._load_engine(engine_path)
        self.context = self.engine.create_execution_context()
        self.device = device
        print("TensorRT engine loaded and buffers allocated.")

    def _load_engine(self, engine_path):
        """Load and deserialize a TensorRT engine from file."""
        with open(engine_path, 'rb') as f, trt.Runtime(self.logger) as runtime:
            return runtime.deserialize_cuda_engine(f.read())

    def _allocate_buffers(self):
        """Allocate pinned-host and device buffers for every I/O tensor.

        Sizes come from the *context's* current tensor shapes, so any
        dynamic input shape must already have been fixed with
        set_input_shape() before calling this.

        Returns:
            (buffers, stream): dict of per-tensor buffer records and a
            fresh CUDA stream.
        """
        buffers = {}
        stream = cuda.Stream()
        for i in range(self.engine.num_io_tensors):
            name = self.engine.get_tensor_name(i)
            mode = self.engine.get_tensor_mode(name)
            dtype = trt.nptype(self.engine.get_tensor_dtype(name))
            shape = self.context.get_tensor_shape(name)
            # Unresolved dynamic dims (-1): allocate a 1-element
            # placeholder; callers must set concrete shapes first.
            size = trt.volume(shape) if -1 not in shape else 1
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            buffers[name] = {
                'host': host_mem,
                'device': device_mem,
                'dtype': dtype,
                'mode': mode,
            }
        return buffers, stream

    def allocate_buffers(self, input_shapes):
        """Allocate buffers from explicitly supplied input shapes.

        Args:
            input_shapes: Mapping of input tensor name -> concrete shape.
                Dynamic output dims (-1) are resolved with the leading
                (batch) dim of input_shapes["input:0"].

        Returns:
            (buffers, stream) as in _allocate_buffers().
        """
        buffers = {}
        stream = cuda.Stream()
        for i in range(self.engine.num_io_tensors):
            name = self.engine.get_tensor_name(i)
            mode = self.engine.get_tensor_mode(name)
            dtype = trt.nptype(self.engine.get_tensor_dtype(name))
            if mode == trt.TensorIOMode.INPUT:
                shape = input_shapes[name]
            else:
                shape = self.context.get_tensor_shape(name)
                # Resolve dynamic output dims with the input batch size.
                shape = tuple(input_shapes["input:0"][0] if dim == -1 else dim for dim in shape)
            size = trt.volume(shape)
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            buffers[name] = {
                'host': host_mem,
                'device': device_mem,
                'dtype': dtype,
                'mode': mode,
            }
        return buffers, stream

    def infer(self, inputs):
        """Run inference with provided inputs.

        Args:
            inputs: Mapping of input tensor name -> numpy array.

        Returns:
            Mapping of output tensor name -> flat numpy host buffer.
        """
        # Bind device addresses and copy inputs host -> device.
        for name, buffer in self.buffers.items():
            self.context.set_tensor_address(name, int(buffer['device']))
            if buffer['mode'] == trt.TensorIOMode.INPUT:
                self.context.set_input_shape(name, inputs[name].shape)
                np.copyto(buffer['host'], inputs[name].ravel())
                cuda.memcpy_htod_async(buffer['device'], buffer['host'], self.stream)
        # Execute inference.
        self.context.execute_async_v3(stream_handle=self.stream.handle)
        # Copy outputs back to host.
        results = {}
        for name, buffer in self.buffers.items():
            if buffer['mode'] == trt.TensorIOMode.OUTPUT:
                cuda.memcpy_dtoh_async(buffer['host'], buffer['device'], self.stream)
                results[name] = buffer['host']
        self.stream.synchronize()
        return results

    def __call__(self, inputs):
        """Run end-to-end inference for a list of {"name", "data"} dicts.

        NOTE(review): input["data"].cpu() copies a (possibly CUDA) torch
        tensor back to host before the PyCUDA upload. When the inputs are
        created with torch's .to("cuda"), torch and pycuda.autoinit each
        hold a CUDA context in the same process — a likely cause of the
        reported "Cask convolution execution" errors. Confirm by sharing
        one context (e.g. torch/TensorRT stream interop) instead of
        relying on pycuda.autoinit.

        Returns:
            torch.Tensor: the first output, reshaped to its engine shape.
        """
        inputs_dict = {}
        for input in inputs:
            # Fix dynamic shapes so _allocate_buffers sees concrete sizes.
            self.context.set_input_shape(input["name"], input["data"].shape)
            # Prepare inputs as float32 host arrays.
            inputs_dict[input["name"]] = input["data"].cpu().numpy().astype(np.float32)
        # Allocate buffers (sized from the shapes set above).
        self.buffers, self.stream = self._allocate_buffers()
        # Run inference.
        results = self.infer(inputs_dict)
        # Reshape flat output buffers to the context's tensor shapes.
        reshaped_outputs = {
            name: results[name].reshape(self.context.get_tensor_shape(name))
            for name, buffer in self.buffers.items() if buffer['mode'] == trt.TensorIOMode.OUTPUT
        }
        # Return the first (assumed only) output as a torch tensor.
        output = torch.from_numpy(next(iter(reshaped_outputs.values())))
        return output

    def get_tensor_info(self):
        """Retrieve name/mode/shape information for every I/O tensor."""
        tensor_info = []
        for i in range(self.engine.num_io_tensors):
            name = self.engine.get_tensor_name(i)
            mode = "Input" if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT else "Output"
            shape = self.context.get_tensor_shape(name)
            tensor_info.append({'name': name, 'mode': mode, 'shape': shape})
        return tensor_info
def speed_test(inferencer, bs=1, run_times=10):
    """Benchmark average inference latency.

    Runs one untimed warm-up iteration followed by ``run_times`` timed
    iterations with fresh random inputs each time.

    Args:
        inferencer: Callable TensorRT inferencer exposing get_tensor_info().
        bs: Batch size substituted for dynamic (-1) dims.
        run_times: Number of timed iterations.
    """
    tensor_info = inferencer.get_tensor_info()
    inputs = []
    for info in tensor_info:
        if info['mode'] == "Input":
            inputs.append(dict(
                name=info['name'],
                # Replace dynamic dims with the requested batch size.
                shape=tuple(bs if dim == -1 else dim for dim in info["shape"]),
                data=None,
            ))
    total_time = 0.0
    for i in range(run_times + 1):
        for input in inputs:
            input["data"] = torch.tensor(np.random.randn(*input["shape"]), dtype=torch.float32).to("cuda")
        t1 = time.time()
        outputs = inferencer(inputs)
        if i > 0:
            # Iteration 0 is a warm-up: exclude it from the timing so the
            # average over run_times is not inflated by one-time setup.
            total_time += time.time() - t1
            print(outputs)
    print(f"Average time over {run_times} runs: {total_time / run_times:.3f}s")
def show_info(inferencer):
    """Print name, I/O mode, and shape for every engine tensor."""
    print("\nTensor information:")
    for entry in inferencer.get_tensor_info():
        print(f"Tensor: {entry['name']}, Mode: {entry['mode']}, Shape: {entry['shape']}")
if __name__ == "__main__":
    import torch

    # Initialize the TensorRT inferencer with a serialized engine file.
    engine_path = (
        "241203_symmetry_WS&GN_3s.trt"
    )
    inferencer = TensorRTInferencer(engine_path, "cuda:0")
    # show_info(inferencer)
    speed_test(inferencer, bs=1, run_times=1)