1 change: 1 addition & 0 deletions .gitignore
@@ -1,3 +1,4 @@
models
build
*.wts
*.engine
6 changes: 6 additions & 0 deletions .pre-commit-config.yaml
@@ -3,15 +3,21 @@ repos:
    rev: v4.5.0
    hooks:
      - id: check-merge-conflict
        types: [python]
      - id: check-symlinks
        types: [python]
      - id: end-of-file-fixer
        types: [python]
      - id: trailing-whitespace
        types: [python]
      - id: check-added-large-files
        types: [python]
  - repo: https://github.com/pre-commit/mirrors-clang-format
    rev: v14.0.6
    hooks:
      - id: clang-format
        types_or: [c++, c, cuda]
        args: [-style=file]
  - repo: https://github.com/PyCQA/flake8
    rev: 7.0.0
    hooks:
263 changes: 163 additions & 100 deletions README.md

Large diffs are not rendered by default.

56 changes: 38 additions & 18 deletions alexnet/CMakeLists.txt
@@ -1,25 +1,45 @@
cmake_minimum_required(VERSION 2.6)
cmake_minimum_required(VERSION 3.14)

project(alexnet)
project(
  alexnet
  VERSION 0.1
  LANGUAGES C CXX CUDA)

add_definitions(-std=c++11)
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
  set(CMAKE_CUDA_ARCHITECTURES
      60
      70
      72
      75
      80
      86
      89)
endif()

option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_INCLUDE_CURRENT_DIR TRUE)

include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt, you need to adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/usr/include/x86_64-linux-gnu/)
link_directories(/usr/lib/x86_64-linux-gnu/)
option(CUDA_USE_STATIC_CUDA_RUNTIME "Use static CUDA runtime library" OFF)

add_executable(alexnet ${PROJECT_SOURCE_DIR}/alex.cpp)
target_link_libraries(alexnet nvinfer)
target_link_libraries(alexnet cudart)
find_package(Threads REQUIRED)
find_package(CUDAToolkit REQUIRED)
find_package(OpenCV REQUIRED)

add_definitions(-O2 -pthread)
if(NOT TARGET TensorRT::TensorRT)
  include(FindTensorRT.cmake)
else()
  message("TensorRT has been found, skipping for ${PROJECT_NAME}")
endif()

add_executable(${PROJECT_NAME} alexnet.cc)

target_include_directories(${PROJECT_NAME} PRIVATE ${CMAKE_CURRENT_LIST_DIR}
                                                   ${OpenCV_INCLUDE_DIRS})

target_link_libraries(
  ${PROJECT_NAME} PRIVATE Threads::Threads TensorRT::TensorRT CUDA::cudart
  ${OpenCV_LIBS})
121 changes: 121 additions & 0 deletions alexnet/FindTensorRT.cmake
@@ -0,0 +1,121 @@
cmake_minimum_required(VERSION 3.17.0)

set(TRT_VERSION
    $ENV{TRT_VERSION}
    CACHE
    STRING
    "TensorRT version, e.g. \"8.6.1.6\" or \"8.6.1.6+cuda12.0.1.011\", etc")

function(_guess_path var_name required_files)
  set(_result "")

  foreach(path_entry IN LISTS ARGN)
    if(NOT EXISTS "${path_entry}")
      message(DEBUG "skip non-existing path '${path_entry}'")
      continue()
    endif()

    set(_ok TRUE)
    foreach(required_file IN LISTS required_files)
      if(NOT EXISTS "${path_entry}/${required_file}")
        set(_ok FALSE)
        message(DEBUG "'${path_entry}' missing '${required_file}'")
        break()
      endif()
    endforeach()

    if(_ok)
      list(APPEND _result "${path_entry}")
      message(DEBUG "accept '${path_entry}'")
    else()
      message(DEBUG "reject '${path_entry}'")
    endif()
  endforeach()

  if(_result STREQUAL "")
    message(
      FATAL_ERROR
        "_guess_path(${var_name}) failed: no valid path found. required_files='${required_files}' candidates='${ARGN}'"
    )
  endif()

  set(${var_name}
      "${_result}"
      PARENT_SCOPE)
endfunction()

# find TensorRT include folder
if(NOT DEFINED TensorRT_INCLUDE_DIR)
  if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
    _guess_path(
      TensorRT_INCLUDE_DIR "NvInfer.h" "/usr/include/aarch64-linux-gnu"
      "/usr/include" "/usr/local/cuda/targets/aarch64-linux/include")
  else()
    _guess_path(
      TensorRT_INCLUDE_DIR "NvInfer.h"
      "/usr/local/tensorrt/targets/x86_64-linux-gnu/include"
      "/usr/include/x86_64-linux-gnu" "/usr/include")
  endif()
  message(STATUS "TensorRT includes: ${TensorRT_INCLUDE_DIR}")
endif()

# find TensorRT library folder
if(NOT TensorRT_LIBRARY_DIR)
  if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
    _guess_path(
      TensorRT_LIBRARY_DIR "libnvinfer.so;libnvinfer_plugin.so"
      "/usr/lib/aarch64-linux-gnu;/usr/lib/aarch64-linux-gnu/tegra" "/usr/lib")
  else()
    _guess_path(
      TensorRT_LIBRARY_DIR
      "libnvinfer.so;libnvinfer_plugin.so"
      "/usr/lib/x86_64-linux-gnu;/usr/local/tensorrt/targets/x86_64-linux-gnu/lib;/usr/lib"
    )
  endif()
  message(STATUS "TensorRT libraries: ${TensorRT_LIBRARY_DIR}")
endif()

set(TensorRT_LIBRARIES)

# process for different TensorRT version
if(DEFINED TRT_VERSION AND NOT TRT_VERSION STREQUAL "")
  string(REGEX MATCH "([0-9]+)" _match ${TRT_VERSION})
  set(TRT_MAJOR_VERSION "${_match}")
  set(_modules nvinfer nvinfer_plugin)
  unset(_match)

  if(TRT_MAJOR_VERSION GREATER_EQUAL 8)
    list(APPEND _modules nvinfer_vc_plugin nvinfer_dispatch nvinfer_lean)
  endif()
else()
  message(FATAL_ERROR "Please set the environment variable \"TRT_VERSION\"")
endif()

# find and add all modules of TensorRT into list
foreach(lib IN LISTS _modules)
  find_library(
    TensorRT_${lib}_LIBRARY
    NAMES ${lib}
    HINTS ${TensorRT_LIBRARY_DIR})
  list(APPEND TensorRT_LIBRARIES ${TensorRT_${lib}_LIBRARY})
endforeach()

message(STATUS "Found TensorRT libs: ${TensorRT_LIBRARIES}")

# make the "TensorRT target"
add_library(TensorRT IMPORTED INTERFACE)
add_library(TensorRT::TensorRT ALIAS TensorRT)
target_link_libraries(TensorRT INTERFACE ${TensorRT_LIBRARIES})

set_target_properties(
  TensorRT
  PROPERTIES C_STANDARD 17
             CXX_STANDARD 17
             POSITION_INDEPENDENT_CODE ON
             SKIP_BUILD_RPATH TRUE
             BUILD_WITH_INSTALL_RPATH TRUE
             INSTALL_RPATH "$ORIGIN"
             INTERFACE_INCLUDE_DIRECTORIES "${TensorRT_INCLUDE_DIR}")

unset(TRT_MAJOR_VERSION)
unset(_modules)
105 changes: 91 additions & 14 deletions alexnet/README.md
@@ -1,33 +1,110 @@
# alexnet

AlexNet model architecture from the "One weird trick..." <https://arxiv.org/abs/1404.5997>`_ paper.
## Introduction

For the details, you can refer to [pytorchx/alexnet](https://github.com/wang-xinyu/pytorchx/tree/master/alexnet)
The AlexNet model architecture comes from the paper [One weird trick for parallelizing convolutional neural networks](https://arxiv.org/abs/1404.5997). To generate the `.wts` file, you can refer to [pytorchx/alexnet](https://github.com/wang-xinyu/pytorchx/tree/master/alexnet). For the PyTorch implementation of AlexNet, see [torchvision's alexnet.py](https://github.com/pytorch/vision/blob/main/torchvision/models/alexnet.py#L17).

This alexnet is just several `conv-relu-pool` blocks followed by several `fc-relu`, nothing special. All layers can be implemented by tensorrt api, including `addConvolution`, `addActivation`, `addPooling`, `addFullyConnected`.
AlexNet consists of 3 major parts: features, adaptive average pooling, and classifier:

- features: just several stacked `CRP` (conv-relu-pool) and `CR` blocks
- adaptive average pooling: PyTorch derives its parameters from the input size automatically, but with the TensorRT API we have to calculate them ourselves (see the sketch below)
- classifier: just several `fc-relu` layers. All layers can be implemented with the TensorRT API, including `addConvolution`, `addActivation`, `addPooling`, `addMatrixMultiply`, `addElementWise`, etc.
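
A minimal sketch of that pooling calculation with the TensorRT C++ API; the helper name `addAdaptiveAvgPool` and its parameters are illustrative, not code from this repo. For AlexNet's shapes, the fixed kernel and stride computed below match PyTorch's adaptive pooling:

```c++
#include "NvInfer.h"

// Emulate PyTorch's AdaptiveAvgPool2d(output_size=(outHW, outHW)) with a fixed
// pooling layer: stride = floor(in / out), kernel = in - (out - 1) * stride.
// With the standard 224x224 input, features already yield 256x6x6, so this
// degenerates to a 1x1 window with stride 1.
nvinfer1::IPoolingLayer* addAdaptiveAvgPool(nvinfer1::INetworkDefinition& network,
                                            nvinfer1::ITensor& input,
                                            int inHW, int outHW) {
    const int stride = inHW / outHW;
    const int kernel = inHW - (outHW - 1) * stride;
    nvinfer1::IPoolingLayer* pool = network.addPoolingNd(
        input, nvinfer1::PoolingType::kAVERAGE, nvinfer1::Dims2{kernel, kernel});
    pool->setStrideNd(nvinfer1::Dims2{stride, stride});
    return pool;
}
```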

## Use AlexNet from PyTorch

We can use torchvision to load the pretrained AlexNet model:

```python
import torchvision

alexnet = torchvision.models.alexnet(pretrained=True)
```
// 1. generate alexnet.wts from [pytorchx/alexnet](https://github.com/wang-xinyu/pytorchx/tree/master/alexnet)

// 2. put alexnet.wts into tensorrtx/alexnet
The model structure is:

```txt
AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
    (2): ReLU(inplace=True)
    (3): Dropout(p=0.5, inplace=False)
    (4): Linear(in_features=4096, out_features=4096, bias=True)
    (5): ReLU(inplace=True)
    (6): Linear(in_features=4096, out_features=1000, bias=True)
  )
)
```
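
Recent TensorRT releases deprecate `addFullyConnected`, which is why the list above mentions `addMatrixMultiply` and `addElementWise` for the classifier's `Linear` layers. A minimal sketch of a `Linear` built that way; the helper name and tensor shapes are illustrative, and `weight` and `bias` are assumed to be constant tensors already added to the network (e.g. via `addConstant`):

```c++
#include "NvInfer.h"

// Linear(in_features=C, out_features=K): y = x * W + b
// input: [1, C], weight: [C, K], bias: [1, K]. A PyTorch weight of shape
// [K, C] can be used directly by passing MatrixOperation::kTRANSPOSE instead.
nvinfer1::ITensor* addLinear(nvinfer1::INetworkDefinition& network,
                             nvinfer1::ITensor& input,
                             nvinfer1::ITensor& weight,
                             nvinfer1::ITensor& bias) {
    auto* mm = network.addMatrixMultiply(input, nvinfer1::MatrixOperation::kNONE,
                                         weight, nvinfer1::MatrixOperation::kNONE);
    auto* sum = network.addElementWise(*mm->getOutput(0), bias,
                                       nvinfer1::ElementWiseOperation::kSUM);
    return sum->getOutput(0);
}
```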

// 3. build and run
## Usage

cd tensorrtx/alexnet
1. Use `gen_wts.py` to generate the `.wts` file.

mkdir build
```bash
python3 gen_wts.py
```

cd build
2. Build the C++ code.

cmake ..
```bash
pushd tensorrtx/alexnet
cmake -S . -B build -G Ninja --fresh
cmake --build build
```

make
3. Serialize the `.wts` model to an engine file.

sudo ./alexnet -s // serialize model to plan file i.e. 'alexnet.engine'
```bash
./build/alexnet -s
```

sudo ./alexnet -d // deserialize plan file and run inference
4. Run inference.

// 4. see if the output is same as pytorchx/alexnet
```bash
./build/alexnet -d
```

The output looks like:

```txt
...
====
Execution time: 1ms
0.1234, -0.5678, ...
====
prediction result:
Top: 0 idx: 285, logits: 9.9, label: Egyptian cat
Top: 1 idx: 281, logits: 8.304, label: tabby, tabby cat
Top: 2 idx: 282, logits: 6.859, label: tiger cat
```
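
The prediction lines are simply the indices of the largest logits mapped to ImageNet labels. A minimal sketch of that top-k selection (the function name and buffer layout are illustrative, not this repo's exact code):

```c++
#include <algorithm>
#include <numeric>
#include <vector>

// Return the indices of the k largest logits
// (the "Top: n idx: ..." lines above).
std::vector<int> topK(const float* logits, int size, int k) {
    std::vector<int> idx(size);
    std::iota(idx.begin(), idx.end(), 0);  // 0, 1, ..., size - 1
    std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
                      [logits](int a, int b) { return logits[a] > logits[b]; });
    idx.resize(k);
    return idx;
}
```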

## FAQ

### How to align the output with PyTorch?

If your output is different from PyTorch's, you have to check which TensorRT API call or which part of your code causes the difference. A simple approach is to check the `.engine` output part by part, e.g., you can set an early layer of AlexNet as the output:

```c++
pool3->getOutput(0)->setName(OUTPUT_NAME);  // original is: "fc3_1->getOutput(0)->setName(OUTPUT_NAME);"
network->markOutput(*pool3->getOutput(0));  // original is: "network->markOutput(*fc3_1->getOutput(0));"
```

For this line of code, we take the output from the "features" part of AlexNet and ignore the rest of the model. Don't forget to change the `OUTPUT_SIZE` macro at the top of the file, and then rebuild the `.engine` file to apply the changes.

You can sum up the whole output tensor in the C++ code and compare it with the PyTorch output; on the PyTorch side you can do this with `torch.sum(x)` while debugging. The acceptable deviation between the two sums is around $[10^{-2}, 10^{-1}]$; for this example, since the "features" output has $256 * 6 * 6$ elements (batch = 1), the final per-element error would roughly be $10^{-4}$.
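
A minimal sketch of the C++ side of this check, assuming the flattened host output buffer is a `float` array; the names `prob` and `OUTPUT_SIZE` follow the repo's samples but are illustrative here:

```c++
// Sum the flattened output buffer so it can be compared with torch.sum(x).
float sumOutput(const float* prob, int size) {
    float sum = 0.f;
    for (int i = 0; i < size; ++i) {
        sum += prob[i];
    }
    return sum;
}

// usage after inference:
// std::cout << "output sum: " << sumOutput(prob, OUTPUT_SIZE) << std::endl;
```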

Note: this is only a quick check. For a more accurate comparison, you have to save the output tensors into files and compare them value by value, but that is rarely necessary.