Description
I get an error:
[TensorRT] IExecutionContext::executeV2: Error Code 3: API Usage Error (Parameter check failed, condition: nullPtrAllowed. Tensor "output" is bound to nullptr, which is allowed only for an empty input tensor, shape tensor, or an output tensor associated with an IOuputAllocator.)
while running inference on a toy example, even though the memory appears to be allocated correctly.
#include <NvInfer.h>
#include <cuda_runtime_api.h>
#include <fstream>
#include <vector>
#include <iostream>
#include <memory>
#define ALIGN_TO(value, alignment) (((value) + (alignment) - 1) & ~((alignment) - 1))
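// Example: ALIGN_TO(50176 * sizeof(float), 256) == 200704 (already a multiple of 256),
// while ALIGN_TO(100, 256) would round up to 256. The alignment must be a power of two.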
// Function to load the engine from a file
std::vector<char> loadEngineFile(const std::string& engineFilePath) {
    std::ifstream file(engineFilePath, std::ios::binary);
    if (!file) throw std::runtime_error("Failed to open engine file.");
    return std::vector<char>((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
}
// Logger class implementing TensorRT's ILogger interface
class Logger : public nvinfer1::ILogger {
public:
    void log(Severity severity, const char* msg) noexcept override {
        // Print messages of severity INFO and above (only kVERBOSE is filtered out)
        if (severity <= Severity::kINFO) {
            std::cerr << "[TensorRT] " << msg << std::endl;
        }
    }
};
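// In nvinfer1::ILogger, lower enum values are more severe (kINTERNAL_ERROR = 0 up to
// kVERBOSE = 4), so the "severity <= kINFO" test above keeps everything except verbose output.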
// Function to print elements from a void* blob
void printBlob(const void* blob, size_t count, size_t elementSize, const std::string& type) {
    if (blob == nullptr) {
        std::cerr << "Error: blob is null!" << std::endl;
        return;
    }
    // float* host_data = new float[count];
    // cudaMemcpy(host_data, blob, count * sizeof(float), cudaMemcpyDeviceToHost);
    if (type == "int") {
        const int* data = static_cast<const int*>(blob);
        for (size_t i = 0; i < count; ++i) {
            std::cout << data[i] << " ";
        }
    }
    std::cout << std::endl;
}
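// Note: printBlob() only handles type == "int"; elementSize is unused, and any other
// type string prints just a newline.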
// Global logger instance
Logger gLogger;
int getTotalSize(nvinfer1::Dims& dims) {
    int size = 1;
    for (int i = 0; i < dims.nbDims; i++) {
        size *= dims.d[i];
    }
    return size;
}
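// Caveat: this assumes a static-shape engine. With dynamic shapes, getTensorShape()
// reports unresolved dimensions as -1 and this product would be meaningless.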
std::vector<nvinfer1::Dims> getTensorShapes(std::vector<std::string> tensorNames, nvinfer1::ICudaEngine* engine) {
    std::vector<nvinfer1::Dims> shapesToReturn;
    for (auto tensorName : tensorNames) {
        nvinfer1::Dims shape = engine->getTensorShape(tensorName.c_str());
        shapesToReturn.push_back(shape);
        std::string dimensions;
        for (int i = 0; i < shape.nbDims; i++) {
            dimensions += std::to_string(shape.d[i]);
            if (i < shape.nbDims - 1) {
                dimensions += ", ";
            }
        }
        gLogger.log(nvinfer1::ILogger::Severity::kINFO, ("\t" + tensorName + ", dim: (" + dimensions + "), total_size = " + std::to_string(getTotalSize(shape))).c_str());
        // Let's make sure the data types are all float32
        nvinfer1::DataType type = engine->getTensorDataType(tensorName.c_str());
        if (type != nvinfer1::DataType::kFLOAT) {
            gLogger.log(nvinfer1::ILogger::Severity::kERROR, "Only float32 inputs are supported.");
            exit(1);
        }
    }
    return shapesToReturn;
}
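// An alternative to hardcoding tensor names (used further down in main) is to
// enumerate them via engine->getNbIOTensors() and engine->getIOTensorName(i).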
void** allocateDeviceMemory(std::vector<nvinfer1::Dims> shapes) {
    void** buffers = new void*[shapes.size()];
    size_t freeMem, totalMem;
    for (size_t i = 0; i < shapes.size(); i++) {
        auto shape = shapes[i];
        size_t totalSize = getTotalSize(shape);
        size_t aligned_memory_size = ALIGN_TO(totalSize * sizeof(float), 256);
        std::cout << "Aligned memory size: " << aligned_memory_size << std::endl;
        cudaMemGetInfo(&freeMem, &totalMem);
        std::cout << "Free memory: " << freeMem << " / Total memory: " << totalMem << std::endl;
        // Allocate memory
        cudaError_t err = cudaMalloc(&buffers[i], aligned_memory_size);
        if (err != cudaSuccess) {
            gLogger.log(nvinfer1::ILogger::Severity::kERROR, "Failed to allocate device memory.");
            std::cerr << "cudaMalloc failed for buffer " << i << " with error: " << cudaGetErrorString(err) << std::endl;
            exit(1);
        }
        // Fill every byte of the buffer with the value 13
        int value = 13;
        err = cudaMemset(buffers[i], value, aligned_memory_size);
        if (err != cudaSuccess) {
            gLogger.log(nvinfer1::ILogger::Severity::kERROR, "Failed to set device memory.");
            std::cerr << "cudaMemset failed for buffer " << i << " with error: " << cudaGetErrorString(err) << std::endl;
            exit(1);
        }
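        // cudaMemset writes per byte, so each 4-byte int in the buffer reads back
        // as 0x0D0D0D0D == 218959117, which matches the values printed below.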
        // Verify that the buffer is not null
        if (buffers[i] == nullptr) {
            gLogger.log(nvinfer1::ILogger::Severity::kERROR, "Buffer is null after allocation.");
            std::cerr << "Buffer " << i << " is null after allocation." << std::endl;
            exit(1);
        }
        // Get the number of elements in the shape
        int numElements = getTotalSize(shape); // Ensure this correctly gives the number of elements
        (void)numElements; // currently unused; only the first 10 elements are printed below
        // Print the first few elements of the allocated memory
        std::cout << "Printing blob for buffer " << i << std::endl;
        int* host_data = new int[10];
        cudaMemcpy(host_data, buffers[i], 10 * sizeof(int), cudaMemcpyDeviceToHost);
        printBlob(host_data, 10, sizeof(int), "int");
        delete[] host_data; // avoid leaking the scratch buffer
        std::cout << "Done printing blob for buffer " << i << std::endl;
    }
    return buffers;
}
int main() {
    // Initialize TensorRT runtime
    nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(gLogger);
    if (!runtime) {
        gLogger.log(nvinfer1::ILogger::Severity::kERROR, "Failed to create TensorRT runtime.");
        return 1;
    }
    // Load engine file
    std::vector<char> engineData = loadEngineFile("/workspace/onnx_to_tensorrt/super_resolution.engine");
    // Deserialize engine
    nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(engineData.data(), engineData.size());
    if (!engine) {
        gLogger.log(nvinfer1::ILogger::Severity::kERROR, "Failed to deserialize the engine.");
        return 1;
    }
    // List out inputs / outputs for the baseline model
    std::vector<std::string> inputTensorNames = {
        "input",
    };
    std::vector<std::string> outputTensorNames = {
        "output",
    };
    // Make sure inputs are correct
    gLogger.log(nvinfer1::ILogger::Severity::kINFO, "Input tensor names:");
    std::vector<nvinfer1::Dims> inputShapes = getTensorShapes(inputTensorNames, engine);
    // Make sure outputs are correct
    gLogger.log(nvinfer1::ILogger::Severity::kINFO, "Output tensor names:");
    std::vector<nvinfer1::Dims> outputShapes = getTensorShapes(outputTensorNames, engine);
    // Allocate device memory for inputs and outputs
    void* const* inputBuffers = allocateDeviceMemory(inputShapes);
    void* const* outputBuffers = allocateDeviceMemory(outputShapes); // We need this as well
    // Debug: log allocated memory addresses and sizes
    nvinfer1::IExecutionContext* context = engine->createExecutionContext();
    for (size_t i = 0; i < inputShapes.size(); ++i) {
        size_t inputSize = getTotalSize(inputShapes[i]);
        auto const name = inputTensorNames[i];
        if (!context->setTensorAddress(name.c_str(), inputBuffers[i])) {
            gLogger.log(nvinfer1::ILogger::Severity::kERROR, "Failed to set tensor address.");
            exit(1);
        }
        gLogger.log(nvinfer1::ILogger::Severity::kINFO, ("Successfully bound input buffer \"" + name + "\" address: " + std::to_string(reinterpret_cast<uintptr_t>(inputBuffers[i])) + ", size: " + std::to_string(inputSize)).c_str());
        int* host_data = new int[10];
        cudaMemcpy(host_data, inputBuffers[i], 10 * sizeof(int), cudaMemcpyDeviceToHost);
        printBlob(host_data, 10, sizeof(int), "int");
        delete[] host_data; // avoid leaking the scratch buffer
    }
    for (size_t i = 0; i < outputShapes.size(); ++i) {
        size_t outputSize = getTotalSize(outputShapes[i]);
        auto const name = outputTensorNames[i];
        if (!context->setTensorAddress(name.c_str(), outputBuffers[i])) {
            gLogger.log(nvinfer1::ILogger::Severity::kERROR, "Failed to set tensor address.");
            exit(1);
        }
        gLogger.log(nvinfer1::ILogger::Severity::kINFO, ("Successfully bound output buffer \"" + name + "\" address: " + std::to_string(reinterpret_cast<uintptr_t>(outputBuffers[i])) + ", size: " + std::to_string(outputSize)).c_str());
        int* host_data = new int[10];
        cudaMemcpy(host_data, outputBuffers[i], 10 * sizeof(int), cudaMemcpyDeviceToHost);
        printBlob(host_data, 10, sizeof(int), "int");
        delete[] host_data; // avoid leaking the scratch buffer
    }
    // Print out the names and device locations of the I/O tensors
    for (int i = 0; i < engine->getNbIOTensors(); i++) {
        nvinfer1::TensorLocation tensor_location = engine->getTensorLocation(engine->getIOTensorName(i));
        std::string device = (tensor_location == nvinfer1::TensorLocation::kDEVICE) ? "GPU" : "CPU";
        gLogger.log(nvinfer1::ILogger::Severity::kINFO, ("Tensor name: \"" + std::string(engine->getIOTensorName(i)) + "\", device: " + device).c_str());
    }
    // Execute inference
    context->setDebugSync(true);
    std::cout << "Debug state: " << context->getDebugSync() << std::endl;
    bool executionSuccessful = context->executeV2(inputBuffers);
    if (!executionSuccessful) {
        gLogger.log(nvinfer1::ILogger::Severity::kERROR, "Inference execution failed.");
    } else {
        gLogger.log(nvinfer1::ILogger::Severity::kINFO, "Inference execution successful!");
    }
    // TODO: Do something with the output data
    // Copy output data to host
    // cudaMemcpy(hostOutputData, buffers[outputIndex], outputSize * sizeof(float), cudaMemcpyDeviceToHost);
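    // A minimal readback sketch (hypothetical; assumes a single float32 output):
    //
    //     std::vector<float> hostOutput(getTotalSize(outputShapes[0]));
    //     cudaMemcpy(hostOutput.data(), outputBuffers[0],
    //                hostOutput.size() * sizeof(float), cudaMemcpyDeviceToHost);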
    // Release resources
    for (size_t i = 0; i < inputShapes.size(); i++) {
        cudaFree(inputBuffers[i]);
    }
    for (size_t i = 0; i < outputShapes.size(); i++) {
        cudaFree(outputBuffers[i]);
    }
    delete[] inputBuffers;
    delete[] outputBuffers;
    // context->destroy();
    // engine->destroy();
    // runtime->destroy();
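    // destroy() was removed in TensorRT 10; the objects are released with plain
    // delete instead (context before engine, engine before runtime):
    delete context;
    delete engine;
    delete runtime;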
    return 0;
}

Environment
I'm running the code in a container provided in this repo (the 10.7 release, see details below), launching it with:
./docker/launch.sh --tag tensorrt-ubuntu20.04-cuda12.6 --gpus all
TensorRT Version: 10.7, I'm at the following commit hash:
97ff244 (HEAD -> release/10.7, origin/release/10.7, origin/HEAD) Add 2025 Q1 roadmap (#4279)
NVIDIA GPU: T1200
NVIDIA Driver Version: 535.183.01
CUDA Version: 12.6
CUDNN Version: 8.9.6.50 (I got it from the Dockerfile: https://github.com/NVIDIA/TensorRT/blob/release/10.7/docker/ubuntu-22.04.Dockerfile)
Operating System: Ubuntu 20.04 (my host OS, not just the container)
Python Version (if applicable): Not applicable.
Tensorflow Version (if applicable): Not applicable.
PyTorch Version (if applicable): Not applicable.
Baremetal or Container (if so, version): https://github.com/NVIDIA/TensorRT/blob/release/10.7/docker/ubuntu-22.04.Dockerfile
Relevant Files
The code loads a super_resolution.engine file, which I've supplied below.
Model link: https://drive.google.com/file/d/1c02RKBQDTJ-mo9WYh93xZy5nokEwJx_r/view?usp=sharing
Steps To Reproduce
To build the code I use the following CMakeLists.txt:
cmake_minimum_required(VERSION 3.16)
project(tensorrt_minimalistic)
# Set C++ standard and optimization flags
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Ofast -DNDEBUG -Wno-deprecated-declarations")
# For finding FindTensorRT.cmake
set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})
# Specify the path to TensorRT root directory (modify as needed)
if (NOT TensorRT_DIR)
    set(TensorRT_DIR /workspace/TensorRT/)
endif()
# Set CUDA root directory (modify as needed)
set(CUDA_TOOLKIT_ROOT_DIR /usr/local/cuda)
# Include TensorRT and CUDA
find_package(TensorRT REQUIRED)
find_package(CUDA REQUIRED)
# Add include directories
include_directories(${CUDA_INCLUDE_DIRS} ${TensorRT_INCLUDE_DIRS} include)
# Add the main executable
add_executable(main main.cpp)
# Link TensorRT and CUDA libraries
target_link_libraries(main PUBLIC ${CUDA_LIBRARIES} ${TensorRT_LIBRARIES})

and build it with the standard:
mkdir build; cd build; cmake ..; make

and then call it with:
./main

which gives me the following output:
[TensorRT] Loaded engine size: 0 MiB
[TensorRT] Input tensor names:
[TensorRT] input, dim: (1, 1, 224, 224), total_size = 50176
[TensorRT] Output tensor names:
[TensorRT] output, dim: (1, 1, 672, 672), total_size = 451584
Aligned memory size: 200704
Free memory: 3885301760 / Total memory: 4093509632
Printing blob for buffer 0
218959117 218959117 218959117 218959117 218959117 218959117 218959117 218959117 218959117 218959117
Done printing blob for buffer 0
Aligned memory size: 1806336
Free memory: 3885301760 / Total memory: 4093509632
Printing blob for buffer 0
218959117 218959117 218959117 218959117 218959117 218959117 218959117 218959117 218959117 218959117
Done printing blob for buffer 0
[TensorRT] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +25, now: CPU 0, GPU 25 (MiB)
[TensorRT] Successfully bound input buffer "input" address: 140083909607936, size: 50176
218959117 218959117 218959117 218959117 218959117 218959117 218959117 218959117 218959117 218959117
[TensorRT] Successfully bound output buffer "output" address: 140083910934528, size: 451584
218959117 218959117 218959117 218959117 218959117 218959117 218959117 218959117 218959117 218959117
[TensorRT] Tensor name: "input", device: GPU
[TensorRT] Tensor name: "output", device: GPU
Debug state: 1
[TensorRT] IExecutionContext::executeV2: Error Code 3: API Usage Error (Parameter check failed, condition: nullPtrAllowed. Tensor "output" is bound to nullptr, which is allowed only for an empty input tensor, shape tensor, or an output tensor associated with an IOuputAllocator.)
[TensorRT] Inference execution failed.
Commands or scripts: ./main (see details about building above)
Have you tried the latest release?: No, but I just noticed the 10.8 release. Should I give it a try?
Can this model run on other frameworks? For example run ONNX model with ONNXRuntime (polygraphy run <model.onnx> --onnxrt): No idea.