Skip to content

Error Code 1: Cask (Cask convolution execution) during inference with .to("cuda") #4335

@yu0o0

Description

@yu0o0

Description

I am encountering an issue when running inference using a TensorRT engine. Specifically, if I include `.to("cuda")` in my PyTorch tensor preparation, inference fails with errors and the outputs are filled with zeros. Here is the error message:

[TRT] [E] IExecutionContext::executeV2: Error Code 1: Cask (Cask convolution execution)
tensor([[[[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]],

     [[0., 0., 0.,  ..., 0., 0., 0.],
      [0., 0., 0.,  ..., 0., 0., 0.],
      [0., 0., 0.,  ..., 0., 0., 0.],
      [0., 0., 0.,  ..., 0., 0., 0.]]]])

If I remove `.to("cuda")` during input tensor preparation, the issue does not occur and inference works as expected. I suspect this is related to how tensors are transferred to the GPU.

Environment

TensorRT Version: 10.3

NVIDIA GPU:

NVIDIA Driver Version:

CUDA Version: 12.6.68

CUDNN Version: 9.3.0.75

Operating System:

Python Version (if applicable): 3.10

Tensorflow Version (if applicable):

PyTorch Version (if applicable):

Baremetal or Container (if so, version):

Relevant Files

import time

import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
import tensorrt as trt
import torch

class TensorRTInferencer:
    """Thin wrapper around a serialized TensorRT engine.

    Loads the engine, creates an execution context, and manages
    page-locked host / device buffers (via PyCUDA) for each call.

    NOTE(review): ``pycuda.autoinit`` creates its own CUDA context,
    while tensors built with ``.to("cuda")`` live in PyTorch's context.
    Mixing the two is the likely trigger of the reported
    "Cask convolution execution" error — consider
    ``import pycuda.autoprimaryctx`` so both share the primary
    context. TODO confirm.
    """

    def __init__(self, engine_path, device):
        """Initialize the inferencer.

        Args:
            engine_path: Path to the serialized ``.trt`` engine file.
            device: Device identifier string (stored, not otherwise used here).
        """
        # VERBOSE logging helps when diagnosing engine/runtime errors.
        self.logger = trt.Logger(trt.Logger.VERBOSE)
        self.engine = self._load_engine(engine_path)
        self.context = self.engine.create_execution_context()
        self.device = device
        print("TensorRT engine loaded and buffers allocated.")

    def _load_engine(self, engine_path):
        """Load and deserialize a TensorRT engine from *engine_path*."""
        with open(engine_path, 'rb') as f, trt.Runtime(self.logger) as runtime:
            return runtime.deserialize_cuda_engine(f.read())

    def _allocate_buffers(self):
        """Allocate host/device buffers for every engine IO tensor.

        Dynamic dimensions must already be resolved on the execution
        context (via ``set_input_shape``) before calling this; otherwise
        a placeholder size of one element is allocated.

        Returns:
            Tuple ``(buffers, stream)`` where *buffers* maps tensor name
            to a dict with 'host', 'device', 'dtype' and 'mode' entries.
        """
        buffers = {}
        stream = cuda.Stream()

        for i in range(self.engine.num_io_tensors):
            name = self.engine.get_tensor_name(i)
            mode = self.engine.get_tensor_mode(name)
            dtype = trt.nptype(self.engine.get_tensor_dtype(name))
            shape = self.context.get_tensor_shape(name)

            # Placeholder size for still-unresolved dynamic (-1) shapes.
            size = trt.volume(shape) if -1 not in shape else 1
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)

            buffers[name] = {
                'host': host_mem,
                'device': device_mem,
                'dtype': dtype,
                'mode': mode,
            }

        return buffers, stream

    def allocate_buffers(self, input_shapes):
        """Allocate buffers using explicitly supplied input shapes.

        Output shapes with dynamic (-1) dims are resolved by substituting
        the batch dimension taken from ``input_shapes["input:0"][0]``.

        Args:
            input_shapes: Mapping of input tensor name -> concrete shape.

        Returns:
            Tuple ``(buffers, stream)`` as in :meth:`_allocate_buffers`.
        """
        buffers = {}
        stream = cuda.Stream()

        for i in range(self.engine.num_io_tensors):
            name = self.engine.get_tensor_name(i)
            mode = self.engine.get_tensor_mode(name)
            dtype = trt.nptype(self.engine.get_tensor_dtype(name))

            if mode == trt.TensorIOMode.INPUT:
                shape = input_shapes[name]
            else:
                shape = self.context.get_tensor_shape(name)
                # Assume every dynamic output dim equals the input batch size.
                shape = tuple(
                    input_shapes["input:0"][0] if dim == -1 else dim
                    for dim in shape
                )

            size = trt.volume(shape)
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)

            buffers[name] = {
                'host': host_mem,
                'device': device_mem,
                'dtype': dtype,
                'mode': mode,
            }

        return buffers, stream

    def infer(self, inputs):
        """Run one inference pass.

        Args:
            inputs: Mapping of input tensor name -> numpy array.

        Returns:
            Mapping of output tensor name -> flat numpy host buffer.
        """
        # Bind device addresses and upload the inputs asynchronously.
        for name, buffer in self.buffers.items():
            self.context.set_tensor_address(name, int(buffer['device']))
            if buffer['mode'] == trt.TensorIOMode.INPUT:
                self.context.set_input_shape(name, inputs[name].shape)
                np.copyto(buffer['host'], inputs[name].ravel())
                cuda.memcpy_htod_async(buffer['device'], buffer['host'], self.stream)

        # Execute inference on the stream.
        self.context.execute_async_v3(stream_handle=self.stream.handle)

        # Download outputs; synchronize before host buffers are read.
        results = {}
        for name, buffer in self.buffers.items():
            if buffer['mode'] == trt.TensorIOMode.OUTPUT:
                cuda.memcpy_dtoh_async(buffer['host'], buffer['device'], self.stream)
                results[name] = buffer['host']
        self.stream.synchronize()

        return results

    def __call__(self, inputs):
        """Run inference for a list of ``{'name': ..., 'data': tensor}`` dicts.

        Returns:
            The first output, as a CPU ``torch.Tensor``.
        """
        inputs_dict = {}
        for item in inputs:  # renamed from `input` (shadowed the builtin)
            # Resolve dynamic input shapes on the context first so that
            # _allocate_buffers() sees concrete sizes.
            self.context.set_input_shape(item["name"], item["data"].shape)
            # Host-side float32 copy; .cpu() also handles CUDA tensors.
            inputs_dict[item["name"]] = item["data"].cpu().numpy().astype(np.float32)

        # NOTE: buffers are re-allocated on every call; cache them if the
        # allocation cost matters for throughput.
        self.buffers, self.stream = self._allocate_buffers()

        results = self.infer(inputs_dict)

        # Reshape the flat host buffers to the context-reported shapes.
        reshaped_outputs = {
            name: results[name].reshape(self.context.get_tensor_shape(name))
            for name, buffer in self.buffers.items()
            if buffer['mode'] == trt.TensorIOMode.OUTPUT
        }
        # Return the first (and typically only) output as a torch tensor.
        first_output = next(iter(reshaped_outputs.values()))
        return torch.from_numpy(first_output)

    def get_tensor_info(self):
        """Return ``[{'name', 'mode', 'shape'}, ...]`` for every IO tensor."""
        tensor_info = []
        for i in range(self.engine.num_io_tensors):
            name = self.engine.get_tensor_name(i)
            mode = "Input" if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT else "Output"
            shape = self.context.get_tensor_shape(name)
            tensor_info.append({'name': name, 'mode': mode, 'shape': shape})
        return tensor_info

def speed_test(inferencer, bs=1, run_times=10):
    """Benchmark *inferencer* with random inputs and print the average latency.

    Args:
        inferencer: Callable inferencer exposing ``get_tensor_info()``.
        bs: Batch size substituted for dynamic (-1) dimensions.
        run_times: Number of timed runs; one extra warm-up run happens first.
    """
    tensor_info = inferencer.get_tensor_info()
    inputs = []
    for info in tensor_info:
        if info['mode'] == "Input":
            inputs.append(dict(
                name=info['name'],
                shape=tuple(bs if dim == -1 else dim for dim in info["shape"]),
                data=None,
            ))

    total_time = 0.0
    for i in range(run_times + 1):
        for item in inputs:  # renamed from `input` (shadowed the builtin)
            # NOTE(review): building the tensor in torch's CUDA context while
            # pycuda owns a separate context is the suspected trigger of the
            # Cask error reported in this issue — confirm context sharing.
            item["data"] = torch.tensor(
                np.random.randn(*item["shape"]), dtype=torch.float32
            ).to("cuda")
        # perf_counter is a monotonic clock, better suited to timing than time().
        t1 = time.perf_counter()
        outputs = inferencer(inputs)

        if i > 0:  # skip the warm-up iteration in the average
            print(outputs)
            total_time += time.perf_counter() - t1

    print(f"Average time over {run_times} runs: {total_time / run_times:.3f}s")

def show_info(inferencer):
    """Print name, IO mode and shape for each of the engine's tensors."""
    tensor_info = inferencer.get_tensor_info()
    print("\nTensor information:")
    for entry in tensor_info:
        line = f"Tensor: {entry['name']}, Mode: {entry['mode']}, Shape: {entry['shape']}"
        print(line)

if __name__ == "__main__":
    import torch

    # Path to the serialized TensorRT engine to benchmark.
    engine_path = "241203_symmetry_WS&GN_3s.trt"
    inferencer = TensorRTInferencer(engine_path, "cuda:0")

    # Single timed run (plus one warm-up) at batch size 1.
    speed_test(inferencer, bs=1, run_times=1)

Metadata

Metadata

Assignees

No one assigned

    Labels

    triagedIssue has been triaged by maintainers

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions