Skip to content

executeV2 failure of TensorRT 10.7.0.23 when running inference on T1200 GPU, Tensor "output" is bound to nullptr #4344

@MTDzi

Description

@MTDzi

Description

I get an error:

[TensorRT] IExecutionContext::executeV2: Error Code 3: API Usage Error (Parameter check failed, condition: nullPtrAllowed. Tensor "output" is bound to nullptr, which is allowed only for an empty input tensor, shape tensor, or an output tensor associated with an IOuputAllocator.)

while running a toy example inference, even though the memory seems to be allocated correctly.

#include <NvInfer.h>
#include <cuda_runtime_api.h>
#include <fstream>
#include <vector>
#include <iostream>
#include <memory>


#define ALIGN_TO(value, alignment) (((value) + (alignment) - 1) & ~((alignment) - 1))



// Load a serialized TensorRT engine file into a byte buffer.
//
// @param engineFilePath path to the serialized engine on disk
// @return the raw engine bytes (empty vector only for an empty file)
// @throws std::runtime_error if the file cannot be opened or read
std::vector<char> loadEngineFile(const std::string& engineFilePath) {
    // Open at the end so tellg() gives us the size up front — one allocation,
    // and we can detect a short/failed read (the original silently returned
    // whatever the stream iterator produced).
    std::ifstream file(engineFilePath, std::ios::binary | std::ios::ate);
    if (!file) throw std::runtime_error("Failed to open engine file: " + engineFilePath);
    const std::streamsize size = file.tellg();
    file.seekg(0, std::ios::beg);
    std::vector<char> buffer(static_cast<size_t>(size));
    if (size > 0 && !file.read(buffer.data(), size)) {
        throw std::runtime_error("Failed to read engine file: " + engineFilePath);
    }
    return buffer;
}


// Minimal ILogger implementation that forwards TensorRT diagnostics to stderr.
class Logger : public nvinfer1::ILogger {
public:
    // Print everything at INFO severity or more severe; drop only the
    // chattier levels (severity enum values grow as messages get less severe).
    void log(Severity severity, const char* msg) noexcept override {
        if (severity > Severity::kINFO) {
            return;
        }
        std::cerr << "[TensorRT] " << msg << std::endl;
    }
};


// Print `count` elements of a typed HOST buffer to stdout, space-separated,
// followed by a newline.
//
// @param blob        pointer to host memory (copy device buffers back first)
// @param count       number of elements to print
// @param elementSize currently unused; element width is implied by `type`
// @param type        "int" or "float"; any other value prints only a newline
//                    (matching the original behavior for unknown types)
void printBlob(const void* blob, size_t count, size_t elementSize, const std::string& type) {
    if (blob == nullptr) {
        std::cerr << "Error: blob is null!" << std::endl;
        return;
    }

    if (type == "int") {
        const int* data = static_cast<const int*>(blob);
        for (size_t i = 0; i < count; ++i) {
            std::cout << data[i] << " ";
        }
    } else if (type == "float") {
        // Generalization: the tensors in this program are float32
        // (enforced elsewhere), so support printing them directly.
        const float* data = static_cast<const float*>(blob);
        for (size_t i = 0; i < count; ++i) {
            std::cout << data[i] << " ";
        }
    }
    std::cout << std::endl;
}


// Global logger instance shared by main() and the helper functions below,
// and handed to createInferRuntime().
Logger gLogger;


int getTotalSize(nvinfer1::Dims& dims) {
    int size = 1;
    for (int i = 0; i < dims.nbDims; i++) {
        size *= dims.d[i];
    }
    return size;
}


// Look up and log the shape of each named I/O tensor, and enforce that every
// tensor is float32 (the allocation code below assumes sizeof(float)).
//
// @param tensorNames names of engine I/O tensors to query (now passed by
//                    const reference — the original copied the whole vector
//                    and each string; callers are unaffected)
// @param engine      deserialized engine to query (must be non-null)
// @return one Dims entry per name, in the same order
// Exits the process if any tensor is not float32.
std::vector<nvinfer1::Dims> getTensorShapes(const std::vector<std::string>& tensorNames, nvinfer1::ICudaEngine* engine) {
    std::vector<nvinfer1::Dims> shapesToReturn;
    for (const auto& tensorName : tensorNames) {
        nvinfer1::Dims shape = engine->getTensorShape(tensorName.c_str());
        shapesToReturn.push_back(shape);

        // Render the shape as "d0, d1, ..." for the log line.
        std::string dimensions;
        for (int i = 0; i < shape.nbDims; i++) {
            dimensions += std::to_string(shape.d[i]);
            if (i < shape.nbDims - 1) {
                dimensions += ", ";
            }
        }
        gLogger.log(nvinfer1::ILogger::Severity::kINFO, ("\t" + tensorName + ", dim: (" + dimensions + "), total_size = " + std::to_string(getTotalSize(shape))).c_str());

        // The buffer-size math everywhere else uses sizeof(float), so refuse
        // anything that is not float32 rather than silently mis-sizing.
        nvinfer1::DataType type = engine->getTensorDataType(tensorName.c_str());
        if (type != nvinfer1::DataType::kFLOAT) {
            gLogger.log(nvinfer1::ILogger::Severity::kERROR, "Only float32 inputs are supported.");
            exit(1);
        }
    }
    return shapesToReturn;
}

void** allocateDeviceMemory(std::vector<nvinfer1::Dims> shapes) {
    void** buffers = new void*[shapes.size()];
    size_t freeMem, totalMem;

    for (size_t i = 0; i < shapes.size(); i++) {
        auto shape = shapes[i];
        size_t totalSize = getTotalSize(shape);
        size_t aligned_memory_size = ALIGN_TO(totalSize * sizeof(float), 256);
        std::cout << "Aligned memory size: " << aligned_memory_size << std::endl;

        cudaMemGetInfo(&freeMem, &totalMem);
        std::cout << "Free memory: " << freeMem << " / Total memory: " << totalMem << std::endl;

        // Allocate memory
        cudaError_t err = cudaMalloc(reinterpret_cast<void**>(&buffers[i]), aligned_memory_size);
        if (err != cudaSuccess) {
            gLogger.log(nvinfer1::ILogger::Severity::kERROR, "Failed to allocate device memory.");
            std::cerr << "cudaMalloc failed for buffer " << i << " with error: " << cudaGetErrorString(err) << std::endl;
            exit(1);
        }
        
        // Set all values to 0
        int value = 13;
        err = cudaMemset(buffers[i], value, aligned_memory_size);
        if (err != cudaSuccess) {
            gLogger.log(nvinfer1::ILogger::Severity::kERROR, "Failed to set device memory.");
            std::cerr << "cudaMemset failed for buffer " << i << " with error: " << cudaGetErrorString(err) << std::endl;
            exit(1);
        }
        
        // Verify that the buffer is not null
        if (buffers[i] == nullptr) {
            gLogger.log(nvinfer1::ILogger::Severity::kERROR, "Buffer is null after allocation.");
            std::cerr << "Buffer " << i << " is null after allocation." << std::endl;
            exit(1);
        }

        // Get the number of elements in the shape
        int numElements = getTotalSize(shape); // Ensure this correctly gives the number of elements

        // Print the allocated memory
        std::cout << "Printing blob for buffer " << i << std::endl;
        int* host_data = new int[10];
        cudaMemcpy(host_data, buffers[i], 10 * sizeof(int), cudaMemcpyDeviceToHost);
        printBlob(host_data, 10, sizeof(int), "int");
        std::cout << "Done printing blob for buffer " << i << std::endl;
    }
    return buffers;
}

int main() {
    // Initialize TensorRT runtime
    nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(gLogger);
    if (!runtime) {
        gLogger.log(nvinfer1::ILogger::Severity::kERROR, "Failed to create TensorRT runtime.");
        return 1;
    }

    // Load engine file
    std::vector<char> engineData = loadEngineFile("/workspace/onnx_to_tensorrt/super_resolution.engine");

    // Deserialize engine
    nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(engineData.data(), engineData.size());

    // List out inputs / outputs for the baseline model
    std::vector<std::string> inputTensorNames = {
        "input",
    };
    std::vector<std::string> outputTensorNames = {
        "output",
    };

    // Make sure inputs are correct
    gLogger.log(nvinfer1::ILogger::Severity::kINFO, "Input tensor names:");
    std::vector<nvinfer1::Dims> inputShapes = getTensorShapes(inputTensorNames, engine);

    // Make sure outputs are correct
    gLogger.log(nvinfer1::ILogger::Severity::kINFO, "Output tensor names:");
    std::vector<nvinfer1::Dims> outputShapes = getTensorShapes(outputTensorNames, engine);

    // Allocate device memory for inputs and outputs
    void* const* inputBuffers = allocateDeviceMemory(inputShapes);
    void* const* outputBuffers = allocateDeviceMemory(outputShapes); // We need this as well



    // Debug: Log allocated memory addresses and sizes
    nvinfer1::IExecutionContext* context = engine->createExecutionContext();
    for (size_t i = 0; i < inputShapes.size(); ++i) {
        size_t inputSize = getTotalSize(inputShapes[i]);
        auto const name = inputTensorNames[i];

        if (!context->setTensorAddress(name.c_str(), inputBuffers[i])) {
            gLogger.log(nvinfer1::ILogger::Severity::kERROR, "Failed to set tensor address.");
            exit(1);
        }
        gLogger.log(nvinfer1::ILogger::Severity::kINFO, ("Successfully bound input buffer \"" + inputTensorNames[i] + "\" address: " + std::to_string(reinterpret_cast<uintptr_t>(inputBuffers[i])) + ", size: " + std::to_string(inputSize)).c_str());

        int* host_data = new int[10];
        cudaMemcpy(host_data, inputBuffers[i], 10 * sizeof(int), cudaMemcpyDeviceToHost);
        printBlob(host_data, 10, sizeof(int), "int");
    }
    for (size_t i = 0; i < outputShapes.size(); ++i) {
        size_t outputSize = getTotalSize(outputShapes[i]);
        auto const name = outputTensorNames[i];

        if (!context->setTensorAddress(name.c_str(), outputBuffers[i])) {
            gLogger.log(nvinfer1::ILogger::Severity::kERROR, "Failed to set tensor address.");
            exit(1);
        }
        gLogger.log(nvinfer1::ILogger::Severity::kINFO, ("Successfully bound output buffer \"" + name + "\" address: " + std::to_string(reinterpret_cast<uintptr_t>(outputBuffers[i])) + ", size: " + std::to_string(outputSize)).c_str());

        int* host_data = new int[10];
        cudaMemcpy(host_data, outputBuffers[i], 10 * sizeof(int), cudaMemcpyDeviceToHost);
        printBlob(host_data, 10, sizeof(int), "int");
    }



    // Print out the names of the tensors
    for (int i = 0; i < engine->getNbIOTensors(); i++) {
        nvinfer1::TensorLocation tensor_location = engine->getTensorLocation(engine->getIOTensorName(i));

        std::string device = (tensor_location == nvinfer1::TensorLocation::kDEVICE) ? "GPU" : "CPU";

        gLogger.log(nvinfer1::ILogger::Severity::kINFO, ("Tensor name: \"" + std::string(engine->getIOTensorName(i)) + "\", device: " + device).c_str());
    }


    // Execute inference
    context->setDebugSync(true);
    std::cout << "Debug state: " << context->getDebugSync() << std::endl;
    bool executionSuccessful = context->executeV2(inputBuffers);
    if (!executionSuccessful) {
        gLogger.log(nvinfer1::ILogger::Severity::kERROR, "Inference execution failed.");
    } else {
        gLogger.log(nvinfer1::ILogger::Severity::kINFO, "Inference execution successful!!!!!!!!!!!!!!!!!!!!!!!!!");
    }

    // TODO: Do something with the output data
    // Copy output data to host
    // cudaMemcpy(hostOutputData, buffers[outputIndex], outputSize * sizeof(float), cudaMemcpyDeviceToHost);

    // // Release resources
    for (size_t i = 0; i < inputShapes.size(); i++) {
        cudaFree(inputBuffers[i]);
    }
    for (size_t i = 0; i < outputShapes.size(); i++) {
        cudaFree(outputBuffers[i]);
    }
    delete[] inputBuffers;
    delete[] outputBuffers;
    // context->destroy();
    // engine->destroy();
    // runtime->destroy();

    return 0;
}

Environment

I'm running the code in a container provided in this repo (but the 10.7 release, check details below), launching it with:

./docker/launch.sh --tag tensorrt-ubuntu20.04-cuda12.6 --gpus all

TensorRT Version: 10.7, I'm at the following commit hash:

97ff244 (HEAD -> release/10.7, origin/release/10.7, origin/HEAD) Add 2025 Q1 roadmap (#4279)

NVIDIA GPU: T1200

NVIDIA Driver Version: 535.183.01

CUDA Version: 12.6

CUDNN Version: 8.9.6.50 (I got it from the ubuntu-22.04.Dockerfile: https://github.com/NVIDIA/TensorRT/blob/release/10.7/docker/ubuntu-22.04.Dockerfile)

Operating System: Ubuntu 20.04 (not just the container, I mean my host OS is the same)

Python Version (if applicable): Not applicable.

Tensorflow Version (if applicable): Not applicable.

PyTorch Version (if applicable): Not applicable.

Baremetal or Container (if so, version): https://github.com/NVIDIA/TensorRT/blob/release/10.7/docker/ubuntu-22.04.Dockerfile

Relevant Files

I load in the code a super_resolution.engine file, I supplied it below.

Model link: https://drive.google.com/file/d/1c02RKBQDTJ-mo9WYh93xZy5nokEwJx_r/view?usp=sharing

Steps To Reproduce

To build the code I use the following CMakeLists.txt:

cmake_minimum_required(VERSION 3.16)
project(tensorrt_minimalistic)

# Set C++ standard and optimization flags
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Ofast -DNDEBUG -Wno-deprecated-declarations")

# For finding FindTensorRT.cmake
set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})

# Specify the path to TensorRT root directory (modify as needed)
if (NOT TensorRT_DIR)
    set(TensorRT_DIR /workspace/TensorRT/)
endif()

# Set CUDA root directory (modify as needed)
set(CUDA_TOOLKIT_ROOT_DIR /usr/local/cuda)

# Include TensorRT and CUDA
find_package(TensorRT REQUIRED)
find_package(CUDA REQUIRED)

# Add include directories
include_directories(${CUDA_INCLUDE_DIRS} ${TensorRT_INCLUDE_DIRS} include)


# Add the main executable
add_executable(main main.cpp)
# Link TensorRT and CUDA libraries
# BUG FIX: the original linked against a non-existent target "dupa";
# the executable target defined above is "main".
target_link_libraries(main PUBLIC ${CUDA_LIBRARIES} ${TensorRT_LIBRARIES})

and build it with the standard:

mkdir build; cd build; cmake ..; make

and then call it with:

./main

which gives me the following output:

[TensorRT] Loaded engine size: 0 MiB
[TensorRT] Input tensor names:
[TensorRT]      input, dim: (1, 1, 224, 224), total_size = 50176
[TensorRT] Output tensor names:
[TensorRT]      output, dim: (1, 1, 672, 672), total_size = 451584
Aligned memory size: 200704
Free memory: 3885301760 / Total memory: 4093509632
Printing blob for buffer 0
218959117 218959117 218959117 218959117 218959117 218959117 218959117 218959117 218959117 218959117 
Done printing blob for buffer 0
Aligned memory size: 1806336
Free memory: 3885301760 / Total memory: 4093509632
Printing blob for buffer 0
218959117 218959117 218959117 218959117 218959117 218959117 218959117 218959117 218959117 218959117 
Done printing blob for buffer 0
[TensorRT] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +25, now: CPU 0, GPU 25 (MiB)
[TensorRT] Successfully bound input buffer "input" address: 140083909607936, size: 50176
218959117 218959117 218959117 218959117 218959117 218959117 218959117 218959117 218959117 218959117 
[TensorRT] Successfully bound output buffer "output" address: 140083910934528, size: 451584
218959117 218959117 218959117 218959117 218959117 218959117 218959117 218959117 218959117 218959117 
[TensorRT] Tensor name: "input", device: GPU
[TensorRT] Tensor name: "output", device: GPU
Debug state: 1
[TensorRT] IExecutionContext::executeV2: Error Code 3: API Usage Error (Parameter check failed, condition: nullPtrAllowed. Tensor "output" is bound to nullptr, which is allowed only for an empty input tensor, shape tensor, or an output tensor associated with an IOuputAllocator.)
[TensorRT] Inference execution failed.

Commands or scripts: ./main (see details about building above)

Have you tried the latest release?: No, but I just noticed the 10.8 release. Should I give it a try?

Can this model run on other frameworks? For example run ONNX model with ONNXRuntime (polygraphy run <model.onnx> --onnxrt): No idea.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions