1 change: 1 addition & 0 deletions .gitignore
@@ -1,3 +1,4 @@
models
build
*.wts
*.engine
6 changes: 6 additions & 0 deletions .pre-commit-config.yaml
@@ -3,15 +3,21 @@ repos:
    rev: v4.5.0
    hooks:
      - id: check-merge-conflict
        types: [python]
      - id: check-symlinks
        types: [python]
      - id: end-of-file-fixer
        types: [python]
      - id: trailing-whitespace
        types: [python]
      - id: check-added-large-files
        types: [python]
  - repo: https://github.com/pre-commit/mirrors-clang-format
    rev: v14.0.6
    hooks:
      - id: clang-format
        types_or: [c++, c, cuda]
        args: [-style=file]
  - repo: https://github.com/PyCQA/flake8
    rev: 7.0.0
    hooks:
263 changes: 163 additions & 100 deletions README.md

Large diffs are not rendered by default.

56 changes: 38 additions & 18 deletions alexnet/CMakeLists.txt
@@ -1,25 +1,45 @@
cmake_minimum_required(VERSION 2.6)
cmake_minimum_required(VERSION 3.14)

project(alexnet)
project(
  alexnet
  VERSION 0.1
  LANGUAGES C CXX CUDA)

add_definitions(-std=c++11)
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
  set(CMAKE_CUDA_ARCHITECTURES
      60
      70
      72
      75
      80
      86
      89)
endif()

option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_INCLUDE_CURRENT_DIR TRUE)

include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt, you need to adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/usr/include/x86_64-linux-gnu/)
link_directories(/usr/lib/x86_64-linux-gnu/)
option(CUDA_USE_STATIC_CUDA_RUNTIME "Use static CUDA runtime library" OFF)

add_executable(alexnet ${PROJECT_SOURCE_DIR}/alex.cpp)
target_link_libraries(alexnet nvinfer)
target_link_libraries(alexnet cudart)
find_package(Threads REQUIRED)
find_package(CUDAToolkit REQUIRED)
find_package(OpenCV REQUIRED)

add_definitions(-O2 -pthread)
if(NOT TARGET TensorRT::TensorRT)
  include(FindTensorRT.cmake)
else()
  message("TensorRT has been found, skipping for ${PROJECT_NAME}")
endif()

add_executable(${PROJECT_NAME} alexnet.cc)

target_include_directories(${PROJECT_NAME} PRIVATE ${CMAKE_CURRENT_LIST_DIR}
                                                   ${OpenCV_INCLUDE_DIRS})

target_link_libraries(
  ${PROJECT_NAME} PRIVATE Threads::Threads TensorRT::TensorRT CUDA::cudart
  ${OpenCV_LIBS})
121 changes: 121 additions & 0 deletions alexnet/FindTensorRT.cmake
@@ -0,0 +1,121 @@
cmake_minimum_required(VERSION 3.17.0)

set(TRT_VERSION
    $ENV{TRT_VERSION}
    CACHE
    STRING
    "TensorRT version, e.g. \"8.6.1.6\" or \"8.6.1.6+cuda12.0.1.011\", etc")

function(_guess_path var_name required_files)
  set(_result "")

  foreach(path_entry IN LISTS ARGN)
    if(NOT EXISTS "${path_entry}")
      message(DEBUG "skip non-existing path '${path_entry}'")
      continue()
    endif()

    set(_ok TRUE)
    foreach(required_file IN LISTS required_files)
      if(NOT EXISTS "${path_entry}/${required_file}")
        set(_ok FALSE)
        message(DEBUG "'${path_entry}' missing '${required_file}'")
        break()
      endif()
    endforeach()

    if(_ok)
      list(APPEND _result "${path_entry}")
      message(DEBUG "accept '${path_entry}'")
    else()
      message(DEBUG "reject '${path_entry}'")
    endif()
  endforeach()

  if(_result STREQUAL "")
    message(
      FATAL_ERROR
        "_guess_path(${var_name}) failed: no valid path found. required_files='${required_files}' candidates='${ARGN}'"
    )
  endif()

  set(${var_name}
      "${_result}"
      PARENT_SCOPE)
endfunction()

# find TensorRT include folder
if(NOT DEFINED TensorRT_INCLUDE_DIR)
  if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
    _guess_path(
      TensorRT_INCLUDE_DIR "NvInfer.h" "/usr/include/aarch64-linux-gnu"
      "/usr/include" "/usr/local/cuda/targets/aarch64-linux/include")
  else()
    _guess_path(
      TensorRT_INCLUDE_DIR "NvInfer.h"
      "/usr/local/tensorrt/targets/x86_64-linux-gnu/include"
      "/usr/include/x86_64-linux-gnu" "/usr/include")
  endif()
  message(STATUS "TensorRT includes: ${TensorRT_INCLUDE_DIR}")
endif()

# find TensorRT library folder
if(NOT TensorRT_LIBRARY_DIR)
  if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
    _guess_path(
      TensorRT_LIBRARY_DIR "libnvinfer.so;libnvinfer_plugin.so"
      "/usr/lib/aarch64-linux-gnu;/usr/lib/aarch64-linux-gnu/tegra" "/usr/lib")
  else()
    _guess_path(
      TensorRT_LIBRARY_DIR
      "libnvinfer.so;libnvinfer_plugin.so"
      "/usr/lib/x86_64-linux-gnu;/usr/local/tensorrt/targets/x86_64-linux-gnu/lib;/usr/lib"
    )
  endif()
  message(STATUS "TensorRT libraries: ${TensorRT_LIBRARY_DIR}")
endif()

set(TensorRT_LIBRARIES)

# process for different TensorRT version
if(DEFINED TRT_VERSION AND NOT TRT_VERSION STREQUAL "")
  string(REGEX MATCH "([0-9]+)" _match ${TRT_VERSION})
  set(TRT_MAJOR_VERSION "${_match}")
  set(_modules nvinfer nvinfer_plugin)
  unset(_match)

  if(TRT_MAJOR_VERSION GREATER_EQUAL 8)
    list(APPEND _modules nvinfer_vc_plugin nvinfer_dispatch nvinfer_lean)
  endif()
else()
  message(FATAL_ERROR "Please set the environment variable \"TRT_VERSION\"")
endif()

# find and add all modules of TensorRT into list
foreach(lib IN LISTS _modules)
  find_library(
    TensorRT_${lib}_LIBRARY
    NAMES ${lib}
    HINTS ${TensorRT_LIBRARY_DIR})
  list(APPEND TensorRT_LIBRARIES ${TensorRT_${lib}_LIBRARY})
endforeach()

message(STATUS "Found TensorRT libs: ${TensorRT_LIBRARIES}")

# make the "TensorRT target"
add_library(TensorRT IMPORTED INTERFACE)
add_library(TensorRT::TensorRT ALIAS TensorRT)
target_link_libraries(TensorRT INTERFACE ${TensorRT_LIBRARIES})

set_target_properties(
  TensorRT
  PROPERTIES C_STANDARD 17
             CXX_STANDARD 17
             POSITION_INDEPENDENT_CODE ON
             SKIP_BUILD_RPATH TRUE
             BUILD_WITH_INSTALL_RPATH TRUE
             INSTALL_RPATH "$ORIGIN"
             INTERFACE_INCLUDE_DIRECTORIES "${TensorRT_INCLUDE_DIR}")

unset(TRT_MAJOR_VERSION)
unset(_modules)
105 changes: 91 additions & 14 deletions alexnet/README.md
@@ -1,33 +1,110 @@
# alexnet

AlexNet model architecture from the "One weird trick..." <https://arxiv.org/abs/1404.5997>`_ paper.
## Introduction

For the details, you can refer to [pytorchx/alexnet](https://github.com/wang-xinyu/pytorchx/tree/master/alexnet)
The AlexNet model architecture comes from the paper [One weird trick for parallelizing convolutional neural networks](https://arxiv.org/abs/1404.5997). To generate the `.wts` file, you can refer to [pytorchx/alexnet](https://github.com/wang-xinyu/pytorchx/tree/master/alexnet). For the PyTorch implementation of AlexNet, see [torchvision's alexnet.py](https://github.com/pytorch/vision/blob/main/torchvision/models/alexnet.py#L17).

This alexnet is just several `conv-relu-pool` blocks followed by several `fc-relu`, nothing special. All layers can be implemented by tensorrt api, including `addConvolution`, `addActivation`, `addPooling`, `addFullyConnected`.
AlexNet consists of 3 major parts: features, adaptive average pooling, and classifier:

- features: just several stacked `CRP` (conv-relu-pool) and `CR` blocks
- adaptive average pooling: PyTorch derives its parameters from the input size automatically, but with the TensorRT API we have to calculate them ourselves (see the sketch below)
- classifier: just several `fc-relu` layers. All layers can be implemented with the TensorRT API, including `addConvolution`, `addActivation`, `addPooling`, `addMatrixMultiply`, `addElementWise`, etc.
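
A minimal sketch of that pooling calculation with the TensorRT C++ API; the helper name `addAdaptiveAvgPool` and its parameters are illustrative, not code from this repo. For AlexNet's shapes, the fixed kernel and stride computed below match PyTorch's adaptive pooling:

```c++
#include "NvInfer.h"

// Emulate PyTorch's AdaptiveAvgPool2d(output_size=(outHW, outHW)) with a fixed
// pooling layer: stride = floor(in / out), kernel = in - (out - 1) * stride.
// With the standard 224x224 input, features already yield 256x6x6, so this
// degenerates to a 1x1 window with stride 1.
nvinfer1::IPoolingLayer* addAdaptiveAvgPool(nvinfer1::INetworkDefinition& network,
                                            nvinfer1::ITensor& input,
                                            int inHW, int outHW) {
    const int stride = inHW / outHW;
    const int kernel = inHW - (outHW - 1) * stride;
    nvinfer1::IPoolingLayer* pool = network.addPoolingNd(
        input, nvinfer1::PoolingType::kAVERAGE, nvinfer1::Dims2{kernel, kernel});
    pool->setStrideNd(nvinfer1::Dims2{stride, stride});
    return pool;
}
```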

## Use AlexNet from PyTorch

We can use torchvision to load the pretrained AlexNet model:

```python
import torchvision

alexnet = torchvision.models.alexnet(pretrained=True)
```
// 1. generate alexnet.wts from [pytorchx/alexnet](https://github.com/wang-xinyu/pytorchx/tree/master/alexnet)

// 2. put alexnet.wts into tensorrtx/alexnet
The model structure is:

```txt
AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
    (2): ReLU(inplace=True)
    (3): Dropout(p=0.5, inplace=False)
    (4): Linear(in_features=4096, out_features=4096, bias=True)
    (5): ReLU(inplace=True)
    (6): Linear(in_features=4096, out_features=1000, bias=True)
  )
)
```
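
Recent TensorRT releases deprecate `addFullyConnected`, which is why the list above mentions `addMatrixMultiply` and `addElementWise` for the classifier's `Linear` layers. A minimal sketch of a `Linear` built that way; the helper name and tensor shapes are illustrative, and `weight` and `bias` are assumed to be constant tensors already added to the network (e.g. via `addConstant`):

```c++
#include "NvInfer.h"

// Linear(in_features=C, out_features=K): y = x * W + b
// input: [1, C], weight: [C, K], bias: [1, K]. A PyTorch weight of shape
// [K, C] can be used directly by passing MatrixOperation::kTRANSPOSE instead.
nvinfer1::ITensor* addLinear(nvinfer1::INetworkDefinition& network,
                             nvinfer1::ITensor& input,
                             nvinfer1::ITensor& weight,
                             nvinfer1::ITensor& bias) {
    auto* mm = network.addMatrixMultiply(input, nvinfer1::MatrixOperation::kNONE,
                                         weight, nvinfer1::MatrixOperation::kNONE);
    auto* sum = network.addElementWise(*mm->getOutput(0), bias,
                                       nvinfer1::ElementWiseOperation::kSUM);
    return sum->getOutput(0);
}
```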

// 3. build and run
## Usage

cd tensorrtx/alexnet
1. Use `gen_wts.py` to generate the `.wts` file.

mkdir build
```bash
python3 gen_wts.py
```

cd build
2. Build the C++ code.

cmake ..
```bash
pushd tensorrtx/alexnet
cmake -S . -B build -G Ninja --fresh
cmake --build build
```

make
3. Serialize the `.wts` model to an engine file.

sudo ./alexnet -s // serialize model to plan file i.e. 'alexnet.engine'
```bash
./build/alexnet -s
```

sudo ./alexnet -d // deserialize plan file and run inference
4. Run inference.

// 4. see if the output is same as pytorchx/alexnet
```bash
./build/alexnet -d
```

The output looks like:

```txt
...
====
Execution time: 1ms
0.1234, -0.5678, ...
====
prediction result:
Top: 0 idx: 285, logits: 9.9, label: Egyptian cat
Top: 1 idx: 281, logits: 8.304, label: tabby, tabby cat
Top: 2 idx: 282, logits: 6.859, label: tiger cat
```
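
The prediction lines are simply the indices of the largest logits mapped to ImageNet labels. A minimal sketch of that top-k selection (the function name and buffer layout are illustrative, not this repo's exact code):

```c++
#include <algorithm>
#include <numeric>
#include <vector>

// Return the indices of the k largest logits
// (the "Top: n idx: ..." lines above).
std::vector<int> topK(const float* logits, int size, int k) {
    std::vector<int> idx(size);
    std::iota(idx.begin(), idx.end(), 0);  // 0, 1, ..., size - 1
    std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
                      [logits](int a, int b) { return logits[a] > logits[b]; });
    idx.resize(k);
    return idx;
}
```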

## FAQ

### How to align the output with PyTorch?

If your output is different from PyTorch's, you have to check which TensorRT API call or which part of your code causes the difference. A simple approach is to check the `.engine` output part by part, e.g., you can set an early layer of AlexNet as the output:

```c++
pool3->getOutput(0)->setName(OUTPUT_NAME);  // original is: "fc3_1->getOutput(0)->setName(OUTPUT_NAME);"
network->markOutput(*pool3->getOutput(0));  // original is: "network->markOutput(*fc3_1->getOutput(0));"
```

For this line of code, we take the output from the "features" part of AlexNet and ignore the rest of the model. Don't forget to change the `OUTPUT_SIZE` macro at the top of the file, and then rebuild the `.engine` file to apply the changes.

You can sum up the whole output tensor in the C++ code and compare it with the PyTorch output; on the PyTorch side you can do this with `torch.sum(x)` while debugging. The acceptable deviation between the two sums is around $[10^{-2}, 10^{-1}]$; for this example, since the "features" output has $256 * 6 * 6$ elements (batch = 1), the final per-element error would roughly be $10^{-4}$.
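
A minimal sketch of the C++ side of this check, assuming the flattened host output buffer is a `float` array; the names `prob` and `OUTPUT_SIZE` follow the repo's samples but are illustrative here:

```c++
// Sum the flattened output buffer so it can be compared with torch.sum(x).
float sumOutput(const float* prob, int size) {
    float sum = 0.f;
    for (int i = 0; i < size; ++i) {
        sum += prob[i];
    }
    return sum;
}

// usage after inference:
// std::cout << "output sum: " << sumOutput(prob, OUTPUT_SIZE) << std::endl;
```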

Note: this is only a quick check. For a more accurate comparison, you have to save the output tensors into files and compare them value by value, but that is rarely necessary.