Skip to content

Error Code 1: Cask (Cask convolution execution) during inference with .to("cuda") #4335

@yu0o0

Description

@yu0o0

Description

I am encountering an issue when running inference using a TensorRT engine. Specifically, if I include `.to("cuda")` in my PyTorch tensor preparation, inference fails with errors and the outputs are filled with zeros. Here is the error message:

[TRT] [E] IExecutionContext::executeV2: Error Code 1: Cask (Cask convolution execution)
tensor([[[[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]],

     [[0., 0., 0.,  ..., 0., 0., 0.],
      [0., 0., 0.,  ..., 0., 0., 0.],
      [0., 0., 0.,  ..., 0., 0., 0.],
      [0., 0., 0.,  ..., 0., 0., 0.]]]])

If I remove `.to("cuda")` during input tensor preparation, the issue does not occur and inference works as expected. I suspect this is related to how tensors are transferred to the GPU.

Environment

TensorRT Version: 10.3

NVIDIA GPU:

NVIDIA Driver Version:

CUDA Version: 12.6.68

CUDNN Version: 9.3.0.75

Operating System:

Python Version (if applicable): 3.10

Tensorflow Version (if applicable):

PyTorch Version (if applicable):

Baremetal or Container (if so, version):

Relevant Files

import time

import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
import tensorrt as trt
import torch

class TensorRTInferencer:
    """Thin wrapper around a serialized TensorRT engine.

    Loads the engine, creates an execution context, and manages
    page-locked host / device buffers (via PyCUDA) for each call.

    NOTE(review): ``pycuda.autoinit`` creates its own CUDA context,
    while tensors built with ``.to("cuda")`` live in PyTorch's context.
    Mixing the two is the likely trigger of the reported
    "Cask convolution execution" error — consider
    ``import pycuda.autoprimaryctx`` so both share the primary
    context. TODO confirm.
    """

    def __init__(self, engine_path, device):
        """Initialize the inferencer.

        Args:
            engine_path: Path to the serialized ``.trt`` engine file.
            device: Device identifier string (stored, not otherwise used here).
        """
        # VERBOSE logging helps when diagnosing engine/runtime errors.
        self.logger = trt.Logger(trt.Logger.VERBOSE)
        self.engine = self._load_engine(engine_path)
        self.context = self.engine.create_execution_context()
        self.device = device
        print("TensorRT engine loaded and buffers allocated.")

    def _load_engine(self, engine_path):
        """Load and deserialize a TensorRT engine from *engine_path*."""
        with open(engine_path, 'rb') as f, trt.Runtime(self.logger) as runtime:
            return runtime.deserialize_cuda_engine(f.read())

    def _allocate_buffers(self):
        """Allocate host/device buffers for every engine IO tensor.

        Dynamic dimensions must already be resolved on the execution
        context (via ``set_input_shape``) before calling this; otherwise
        a placeholder size of one element is allocated.

        Returns:
            Tuple ``(buffers, stream)`` where *buffers* maps tensor name
            to a dict with 'host', 'device', 'dtype' and 'mode' entries.
        """
        buffers = {}
        stream = cuda.Stream()

        for i in range(self.engine.num_io_tensors):
            name = self.engine.get_tensor_name(i)
            mode = self.engine.get_tensor_mode(name)
            dtype = trt.nptype(self.engine.get_tensor_dtype(name))
            shape = self.context.get_tensor_shape(name)

            # Placeholder size for still-unresolved dynamic (-1) shapes.
            size = trt.volume(shape) if -1 not in shape else 1
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)

            buffers[name] = {
                'host': host_mem,
                'device': device_mem,
                'dtype': dtype,
                'mode': mode,
            }

        return buffers, stream

    def allocate_buffers(self, input_shapes):
        """Allocate buffers using explicitly supplied input shapes.

        Output shapes with dynamic (-1) dims are resolved by substituting
        the batch dimension taken from ``input_shapes["input:0"][0]``.

        Args:
            input_shapes: Mapping of input tensor name -> concrete shape.

        Returns:
            Tuple ``(buffers, stream)`` as in :meth:`_allocate_buffers`.
        """
        buffers = {}
        stream = cuda.Stream()

        for i in range(self.engine.num_io_tensors):
            name = self.engine.get_tensor_name(i)
            mode = self.engine.get_tensor_mode(name)
            dtype = trt.nptype(self.engine.get_tensor_dtype(name))

            if mode == trt.TensorIOMode.INPUT:
                shape = input_shapes[name]
            else:
                shape = self.context.get_tensor_shape(name)
                # Assume every dynamic output dim equals the input batch size.
                shape = tuple(
                    input_shapes["input:0"][0] if dim == -1 else dim
                    for dim in shape
                )

            size = trt.volume(shape)
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)

            buffers[name] = {
                'host': host_mem,
                'device': device_mem,
                'dtype': dtype,
                'mode': mode,
            }

        return buffers, stream

    def infer(self, inputs):
        """Run one inference pass.

        Args:
            inputs: Mapping of input tensor name -> numpy array.

        Returns:
            Mapping of output tensor name -> flat numpy host buffer.
        """
        # Bind device addresses and upload the inputs asynchronously.
        for name, buffer in self.buffers.items():
            self.context.set_tensor_address(name, int(buffer['device']))
            if buffer['mode'] == trt.TensorIOMode.INPUT:
                self.context.set_input_shape(name, inputs[name].shape)
                np.copyto(buffer['host'], inputs[name].ravel())
                cuda.memcpy_htod_async(buffer['device'], buffer['host'], self.stream)

        # Execute inference on the stream.
        self.context.execute_async_v3(stream_handle=self.stream.handle)

        # Download outputs; synchronize before host buffers are read.
        results = {}
        for name, buffer in self.buffers.items():
            if buffer['mode'] == trt.TensorIOMode.OUTPUT:
                cuda.memcpy_dtoh_async(buffer['host'], buffer['device'], self.stream)
                results[name] = buffer['host']
        self.stream.synchronize()

        return results

    def __call__(self, inputs):
        """Run inference for a list of ``{'name': ..., 'data': tensor}`` dicts.

        Returns:
            The first output, as a CPU ``torch.Tensor``.
        """
        inputs_dict = {}
        for item in inputs:  # renamed from `input` (shadowed the builtin)
            # Resolve dynamic input shapes on the context first so that
            # _allocate_buffers() sees concrete sizes.
            self.context.set_input_shape(item["name"], item["data"].shape)
            # Host-side float32 copy; .cpu() also handles CUDA tensors.
            inputs_dict[item["name"]] = item["data"].cpu().numpy().astype(np.float32)

        # NOTE: buffers are re-allocated on every call; cache them if the
        # allocation cost matters for throughput.
        self.buffers, self.stream = self._allocate_buffers()

        results = self.infer(inputs_dict)

        # Reshape the flat host buffers to the context-reported shapes.
        reshaped_outputs = {
            name: results[name].reshape(self.context.get_tensor_shape(name))
            for name, buffer in self.buffers.items()
            if buffer['mode'] == trt.TensorIOMode.OUTPUT
        }
        # Return the first (and typically only) output as a torch tensor.
        first_output = next(iter(reshaped_outputs.values()))
        return torch.from_numpy(first_output)

    def get_tensor_info(self):
        """Return ``[{'name', 'mode', 'shape'}, ...]`` for every IO tensor."""
        tensor_info = []
        for i in range(self.engine.num_io_tensors):
            name = self.engine.get_tensor_name(i)
            mode = "Input" if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT else "Output"
            shape = self.context.get_tensor_shape(name)
            tensor_info.append({'name': name, 'mode': mode, 'shape': shape})
        return tensor_info

def speed_test(inferencer, bs=1, run_times=10):
    """Benchmark *inferencer* with random inputs and print the average latency.

    Args:
        inferencer: Callable inferencer exposing ``get_tensor_info()``.
        bs: Batch size substituted for dynamic (-1) dimensions.
        run_times: Number of timed runs; one extra warm-up run happens first.
    """
    tensor_info = inferencer.get_tensor_info()
    inputs = []
    for info in tensor_info:
        if info['mode'] == "Input":
            inputs.append(dict(
                name=info['name'],
                shape=tuple(bs if dim == -1 else dim for dim in info["shape"]),
                data=None,
            ))

    total_time = 0.0
    for i in range(run_times + 1):
        for item in inputs:  # renamed from `input` (shadowed the builtin)
            # NOTE(review): building the tensor in torch's CUDA context while
            # pycuda owns a separate context is the suspected trigger of the
            # Cask error reported in this issue — confirm context sharing.
            item["data"] = torch.tensor(
                np.random.randn(*item["shape"]), dtype=torch.float32
            ).to("cuda")
        # perf_counter is a monotonic clock, better suited to timing than time().
        t1 = time.perf_counter()
        outputs = inferencer(inputs)

        if i > 0:  # skip the warm-up iteration in the average
            print(outputs)
            total_time += time.perf_counter() - t1

    print(f"Average time over {run_times} runs: {total_time / run_times:.3f}s")

def show_info(inferencer):
    """Print name, IO mode and shape for each of the engine's tensors."""
    tensor_info = inferencer.get_tensor_info()
    print("\nTensor information:")
    for entry in tensor_info:
        line = f"Tensor: {entry['name']}, Mode: {entry['mode']}, Shape: {entry['shape']}"
        print(line)

if __name__ == "__main__":
    import torch

    # Path to the serialized TensorRT engine to benchmark.
    engine_path = "241203_symmetry_WS&GN_3s.trt"
    inferencer = TensorRTInferencer(engine_path, "cuda:0")

    # Single timed run (plus one warm-up) at batch size 1.
    speed_test(inferencer, bs=1, run_times=1)

Metadata

Metadata

Assignees

No one assigned

    Labels

    triagedIssue has been triaged by maintainers

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions