Merge github.com:shouxieai/tensorRT_cpp

happy · happy · commit 7eb1776a5fb2 · 2021-09-12T17:58:56.000+08:00
diff --git a/.gitignore b/.gitignore
@@ -67,4 +67,5 @@ __pycache__
 
 __pycache__
 
-!/workspace/wget.exe
+!/workspace/wget.exe
+/workspace/*.mp4
diff --git a/.vscode/tasks.json b/.vscode/tasks.json
@@ -7,6 +7,9 @@
             "label": "build",
             "type": "shell",
             "command": "make pro -j25"
+
+            // for cmake
+            //"command": "cd build && make pro -j25"
         }
     ]
 }
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -8,7 +8,10 @@ set(CMAKE_BUILD_TYPE Debug)
 set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/workspace)
 set(HAS_PYTHON ON)
 
-# 取决于你的设备，这是编译cu文件的配置，一般可以不用设置，除非你调用了特定函数例如half新特性
+# 如果要支持python则设置python路径
+set(PythonRoot "/data/datav/newbb/lean/anaconda3/envs/torch1.8")
+set(PythonName "python3.9")
+
 # 如果你是不同显卡，请设置为显卡对应的号码参考这里：https://developer.nvidia.com/zh-cn/cuda-gpus#compute
 set(CUDA_GEN_CODE "-gencode=arch=compute_75,code=sm_75")
 
@@ -27,10 +30,6 @@ set(TENSORRT_DIR "/data/sxai/lean/TensorRT-8.0.1.6-cuda10.2-cudnn8.2")
 # 因为protobuf，需要用特定版本，所以这里指定路径
 set(PROTOBUF_DIR "/data/sxai/lean/protobuf3.11.4")
 
-# 如果要支持python则设置python路径
-set(PythonDIR "/data/datav/newbb/lean/anaconda3/envs/torch1.8")
-set(PythonName "python3.9")
-
 
 find_package(CUDA REQUIRED)
 find_package(OpenCV)
@@ -57,9 +56,9 @@ link_directories(
 )
 
 if("${HAS_PYTHON}" STREQUAL "ON")
-    message("Usage Python ${PythonDIR}")
-    include_directories(${PythonDIR}/include/${PythonName})
-    link_directories(${PythonDIR}/lib)
+    message("Usage Python ${PythonRoot}")
+    include_directories(${PythonRoot}/include/${PythonName})
+    link_directories(${PythonRoot}/lib)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAS_PYTHON")
 endif()
 
diff --git a/Makefile b/Makefile
@@ -26,8 +26,9 @@ lean_tensor_rt := /data/sxai/lean/TensorRT-8.0.1.6-cuda10.2-cudnn8.2
 lean_cudnn     := /data/sxai/lean/cudnn8.2.2.26
 lean_opencv    := /data/sxai/lean/opencv4.2.0
 lean_cuda      := /data/sxai/lean/cuda10.2
-lean_python    := /data/datav/newbb/lean/anaconda3/envs/torch1.8
 use_python     := true
+python_root    := /data/datav/newbb/lean/anaconda3/envs/torch1.8
+python_name    := python3.9
 
 include_paths := src        \
 			src/application \
@@ -42,7 +43,7 @@ include_paths := src        \
 library_paths := $(lean_protobuf)/lib \
 			$(lean_opencv)/lib    \
 			$(lean_tensor_rt)/lib \
-			$(lean_cuda)/lib  \
+			$(lean_cuda)/lib64  \
 			$(lean_cudnn)/lib
 
 link_librarys := opencv_core opencv_imgproc opencv_videoio opencv_imgcodecs \
@@ -55,9 +56,9 @@ link_librarys := opencv_core opencv_imgproc opencv_videoio opencv_imgcodecs \
 support_define    := 
 
 ifeq ($(use_python), true) 
-include_paths  += $(lean_python)/include/python3.9
-library_paths  += $(lean_python)/lib
-link_librarys  += python3.9
+include_paths  += $(python_root)/include/$(python_name)
+library_paths  += $(python_root)/lib
+link_librarys  += $(python_name)
 support_define += -DHAS_PYTHON
 endif
 
@@ -67,6 +68,7 @@ library_paths := $(foreach item,$(library_paths),-L$(item))
 link_librarys := $(foreach item,$(link_librarys),-l$(item))
 
 # 如果是其他显卡，请修改-gencode=arch=compute_75,code=sm_75为对应显卡的能力
+# 显卡对应的号码参考这里：https://developer.nvidia.com/zh-cn/cuda-gpus#compute
 cpp_compile_flags := -std=c++11 -fPIC -m64 -g -fopenmp -w -O0 $(support_define)
 cu_compile_flags  := -std=c++11 -m64 -Xcompiler -fPIC -g -w -gencode=arch=compute_75,code=sm_75 -O0 $(support_define)
 link_flags        := -pthread -fopenmp -Wl,-rpath='$$ORIGIN'
diff --git a/README.md b/README.md
@@ -1,3 +1,4 @@
+##
 ## B站同步视频讲解
 - https://www.bilibili.com/video/BV1Xw411f7FW
 - 相关PPTX下载：http://zifuture.com:1556/fs/sxai/tensorRT.pptx
@@ -14,7 +15,7 @@
 3. TensorRT.vcxproj文件中，修改`<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 10.0.targets" />`为你配置的CUDA路径
 4. TensorRT.vcxproj文件中，修改`<CodeGeneration>compute_61,sm_61</CodeGeneration>`为你显卡配备的计算能力
     - 根据型号参考这里：https://developer.nvidia.com/zh-cn/cuda-gpus#compute
-5. 配置依赖，或者下载依赖到lean中。配置VC++目录->包含目录和引用目录
+5. 配置依赖或者下载依赖到lean中。配置VC++目录->包含目录和引用目录
 6. 配置环境，调试->环境，设置PATH路径
 7. 编译并运行案例
 
@@ -27,7 +28,7 @@ image  = cv2.imread("inference/car.jpg")
 bboxes = yolo.commit(image).get()
 ```
 
-- Pytorch无缝对接
+- Pytorch的无缝对接
 ```python
 model     = models.resnet18(True).eval().to(device)
 trt_model = tp.convert_torch_to_trt(model, input)
@@ -76,31 +77,7 @@ auto box = engine->commit(image).get();
 ## 效果图
 ![](workspace/yq.jpg)
 
-## YoloV5-ONNX推理支持-第一种，使用提供的onnx
-- 这个yolov5m.onnx模型使用官方最新版本直接导出得到
-- CMake
-    - 在CMakeLists.txt中配置依赖路径tensorRT、cuda、cudnn、protobuf
-    ```bash
-    git clone git@github.com:shouxieai/tensorRT_cpp.git
-    cd tensorRT_cpp
-
-    mkdir build
-    cd build
-    cmake ..
-    make yolo -j32
-
-    # 或者make alphapose -j32
-    ```
-
-- Makefile
-    - 在Makefile中配置好依赖的tensorRT、cuda、cudnn、protobuf
-    ```bash
-    git clone git@github.com:shouxieai/tensorRT_cpp.git
-    cd tensorRT_cpp
-    make yolo -j32
-    ```
-
-## YoloV5-ONNX推理支持-第二种，自行从官方导出onnx
+## YoloV5支持
 - yolov5的onnx，你的pytorch版本>=1.7时，导出的onnx模型可以直接被当前框架所使用
 - 你的pytorch版本低于1.7时，或者对于yolov5其他版本（2.0、3.0、4.0），可以对opset进行简单改动后直接被框架所支持
 - 如果你想实现低版本pytorch的tensorRT推理、动态batchsize等更多更高级的问题，请打开我们[博客地址](http://zifuture.com:8090)后找到二维码进群交流
@@ -137,7 +114,7 @@ torch.onnx.export(dynamic_axes={'images': {0: 'batch'},  # shape(1,3,640,640)
 3. 导出onnx模型
 ```bash
 cd yolov5
-python export.py --weights=yolov5s.pt --dynamic --opset=11
+python export.py --weights=yolov5s.pt --dynamic --include=onnx --opset=11
 ```
 4. 复制模型并执行
 ```bash
@@ -146,6 +123,7 @@ cd tensorRT_cpp
 make yolo -j32
 ```
 
+
 ## YoloX的支持
 - https://github.com/Megvii-BaseDetection/YOLOX
 - 你可以选择直接make run，会从镜像地址下载onnx并推理运行看到效果。不需要自行导出
@@ -190,6 +168,7 @@ model.head.decode_in_inference = True
 
 3. 导出onnx模型
 ```bash
+
 # 下载模型，或许你需要翻墙
 # wget https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_m.pth
 
@@ -335,6 +314,7 @@ auto int8process = [](int current, int count, vector<string>& images, shared_ptr
     }
 };
 
+
 // 编译模型指定为INT8
 auto model_file = "yolov5m.int8.trtmodel";
 TRT::compile(
@@ -363,11 +343,11 @@ engine->print();
 // 加载图像
 auto image = imread("demo.jpg");
 
-// 获取模型的输入和输出tensor节点，可以根据名字或者索引获取第几个
+// 获取模型的输入和输出tensor节点，可以根据名字或者索引获取具体第几个
 auto input = engine->input(0);
 auto output = engine->output(0);
 
-// 把图像塞到input tensor中，这里是减去均值，除以标准差
+// 把图像塞到input tensor中，这里是减去均值，并除以标准差
 float mean[] = {0, 0, 0};
 float std[]  = {1, 1, 1};
 input->set_norm_mat(i, image, mean, std);
@@ -409,6 +389,7 @@ int HSwish::enqueue(const std::vector<GTensor>& inputs, std::vector<GTensor>& ou
     return 0;
 }
 
+
 RegisterPlugin(HSwish);
 ```
 
@@ -461,9 +442,9 @@ Engine 0x23dd7780 detail
 [2021-07-22 14:37:42][info][_main.cpp:124]:outputs[0].size = 2
 [2021-07-22 14:37:42][info][_main.cpp:124]:outputs[1].size = 5
 [2021-07-22 14:37:42][info][_main.cpp:124]:outputs[2].size = 1
-```
 
+```
 
 ## 关于
 - 我们的博客地址：http://www.zifuture.com:8090/
-- 我们的B站地址：https://space.bilibili.com/1413433465
+- 我们的B站地址： https://space.bilibili.com/1413433465
diff --git a/TensorRT.vcxproj b/TensorRT.vcxproj
@@ -31,6 +31,7 @@
     <ClCompile Include="src\application\app_scrfd.cpp" />
     <ClCompile Include="src\application\app_scrfd\scrfd.cpp" />
     <ClCompile Include="src\application\app_yolo.cpp" />
+    <ClCompile Include="src\application\app_plugin.cpp" />
     <ClCompile Include="src\application\app_lesson.cpp" />
     <ClCompile Include="src\application\app_yolo\yolo.cpp" />
     <ClCompile Include="src\application\tools\auto_download.cpp" />
diff --git a/TensorRT.vcxproj.filters b/TensorRT.vcxproj.filters
@@ -154,6 +154,9 @@
     <ClCompile Include="src\application\app_arcface.cpp">
       <Filter>src\application</Filter>
     </ClCompile>
+    <ClCompile Include="src\application\app_plugin.cpp">
+      <Filter>src\application</Filter>
+    </ClCompile>
     <ClCompile Include="src\application\app_retinaface\retinaface.cpp">
       <Filter>src\application\app_retinaface</Filter>
     </ClCompile>
diff --git a/python/test_yolov5.py b/python/test_yolov5.py
@@ -23,4 +23,5 @@
 saveto = "single_inference/yolov5.car.jpg"
 print(f"Save to {saveto}")
 
-cv2.imwrite(saveto, image)
+cv2.imwrite(saveto, image)
+
diff --git a/src/application/app_arcface.cpp b/src/application/app_arcface.cpp
@@ -6,6 +6,7 @@
 #include "app_arcface/arcface.hpp"
 #include "tools/deepsort.hpp"
 #include "tools/zmq_remote_show.hpp"
+#include <unordered_map>
 
 using namespace std;
 using namespace cv;
diff --git a/src/application/app_lesson.cpp b/src/application/app_lesson.cpp
@@ -3,9 +3,64 @@
 #include <infer/trt_infer.hpp>
 #include <builder/trt_builder.hpp>
 #include "app_yolo/yolo.hpp"
+#include <cuda_runtime.h>
 
 using namespace std;
 
+// static void test_tensor1(){
+
+//     size_t cpu_bytes = 1024;
+//     size_t gpu_bytes = 2048;
+
+//     ///////////////////////////////////////////////////////////////////
+//     // 封装效果，自动分配和释放
+//     TRT::MixMemory memory;
+//     void* host_ptr   = memory.cpu(cpu_bytes);
+//     void* device_ptr = memory.gpu(gpu_bytes);
+
+//     ///////////////////////////////////////////////////////////////////
+//     // 不封装效果
+//     void* host_ptr   = nullptr;
+//     void* device_ptr = nullptr;
+//     cudaMallocHost(&host_ptr, cpu_bytes);
+//     cudaMalloc(&device_ptr, gpu_bytes);
+
+//     cudaFreeHost(host_ptr);
+//     cudaFree(device_ptr);
+//     ///////////////////////////////////////////////////////////////////
+// }
+
+// Demonstrates the automatic host/device synchronisation of TRT::Tensor:
+// the head flag marks where the newest copy of the data lives, and accessing
+// the other side triggers a copy.
+static void test_tensor2(){
+
+    ///////////////////////////////////////////////////////////////////
+    // Automatic memory copy, driven by the head flag that marks the newest data location.
+    // If the accessed side does not hold the newest data, a copy happens automatically.
+    TRT::Tensor tensor({1, 3, 5, 5}, TRT::DataType::Float);
+    INFO("tensor.head = %s", TRT::data_head_string(tensor.head()));   // prints Init, no memory allocated yet
+
+    tensor.cpu<float>()[0] = 512;               // accessing cpu allocates the cpu memory
+    INFO("tensor.head = %s", TRT::data_head_string(tensor.head()));   // prints Host
+
+    float* device_ptr = tensor.gpu<float>();    // newest data is on Host, so it is copied and head is marked Device
+    INFO("tensor.head = %s", TRT::data_head_string(tensor.head()));   // prints Device
+
+    // device_ptr is the GPU-side buffer (presumably plain device memory), so host
+    // code must not dereference it directly; copy the first element back first.
+    float first_value = 0.0f;
+    cudaError_t status = cudaMemcpy(&first_value, device_ptr, sizeof(first_value), cudaMemcpyDeviceToHost);
+    if(status != cudaSuccess){
+        INFO("cudaMemcpy failed: %s", cudaGetErrorString(status));
+        return;
+    }
+    INFO("device_ptr[0] = %f", first_value);                          // prints 512.00000
+}
+
+// Shows how a multi-dimensional index maps onto a flat element offset, using
+// three equivalent ways of obtaining the same number.
+static void test_tensor3(){
+
+    ///////////////////////////////////////////////////////////////////
+    // Compute the offset of a multi-dimensional index
+    TRT::Tensor tensor({1, 3, 5, 5, 2, 5}, TRT::DataType::Float);
+    auto ptr_origin   = tensor.cpu<float>();
+    auto ptr_channel2 = tensor.cpu<float>(0, 2, 3, 2, 1, 3);
+
+    // Pointer subtraction yields ptrdiff_t, which is wider than int on 64-bit
+    // targets; convert explicitly so the argument matches the %d specifier.
+    INFO("Offset = %d", static_cast<int>(ptr_channel2 - ptr_origin));        // prints 678
+    INFO("Offset = %d", static_cast<int>(tensor.offset(0, 2, 3, 2, 1, 3)));  // prints 678
+
+    // Same offset computed by hand for shape {1, 3, 5, 5, 2, 5}
+    int offset_compute = ((((0 * 3 + 2) * 5 + 3) * 5 + 2) * 2 + 1) * 5 + 3;
+    INFO("Compute = %d", offset_compute);                                    // prints 678
+}
+
 static void lesson1(){
 
     /** 模型编译，onnx到trtmodel **/
@@ -180,19 +235,11 @@ void lesson_cache1frame(){
 int app_lesson(){
 
     iLogger::set_log_level(iLogger::LogLevel::Verbose);
-    lesson1();
+    test_tensor3();
+    // lesson1();
     // lesson2();
     // lesson3();
     // lesson_cache1frame();
     return 0;
 }
 
-
-
-
-
-
-
-
-
- 
diff --git a/src/application/app_yolo.cpp b/src/application/app_yolo.cpp
@@ -169,11 +169,47 @@ static void test(Yolo::Type type, TRT::Mode mode, const string& model){
     forward_engine(model_file, type);
 }
 
+// Compiles a YoloV5 ONNX file into a trtmodel, runs inference on one image
+// and writes a copy of the image with the detected boxes drawn on it.
+void my_test(){
+
+    TRT::compile(
+        TRT::Mode::FP32,
+        5,
+        "/data/sxai/temp/yolov5-5.0/yolov5s.onnx",
+        "my-yolov5-5.0s.trtmodel"
+    );
+    INFO("Done");
+
+    auto yolo = Yolo::create_infer(
+        "my-yolov5-5.0s.trtmodel", 
+        Yolo::Type::V5,
+        0, 0.25f, 0.5f
+    );
+
+    // Guard against a null handle before it is dereferenced below
+    // (presumably what create_infer yields on failure; verify against its declaration).
+    if(yolo == nullptr){
+        INFO("Failed to create yolo infer");
+        return;
+    }
+
+    // cv::imread returns an empty matrix when the file is missing or unreadable.
+    auto image = cv::imread("/data/sxai/tensorRT/workspace/inference/car.jpg");
+    if(image.empty()){
+        INFO("Failed to load image");
+        return;
+    }
+
+    // Submit the same image twice and wait for the second result.
+    auto bboxes = yolo->commits({image, image})[1].get();
+
+    for(auto& box : bboxes){
+
+        // One colour per class label
+        uint8_t r, g, b;
+        tie(r, g, b) = iLogger::random_color(box.class_label);
+
+        cv::rectangle(
+            image, 
+            cv::Point(box.left, box.top),
+            cv::Point(box.right, box.bottom),
+            cv::Scalar(b, g, r),
+            3
+        );
+    }
+    cv::imwrite("my-yolov5s-car.jpg", image);
+}
+
 int app_yolo(){
 
+    // my_test();
     //iLogger::set_log_level(iLogger::LogLevel::Info);
-    //test(Yolo::Type::X, TRT::Mode::FP32, "yolox_m");
-    //test(Yolo::Type::V5, TRT::Mode::FP32, "yolov5s");
+    test(Yolo::Type::X, TRT::Mode::FP32, "yolox_m");
+    // test(Yolo::Type::V5, TRT::Mode::FP32, "yolov5s");
     // test(Yolo::Type::X, TRT::Mode::FP16, "yolox_s");
     // test(Yolo::Type::V5, TRT::Mode::FP16, "yolov5s");
     // test_int8(Yolo::Type::X, "yolox_s");
diff --git a/src/tensorRT/common/trt_tensor.hpp b/src/tensorRT/common/trt_tensor.hpp
diff --git a/src/tensorRT/onnx_parser/NvOnnxParser.h b/src/tensorRT/onnx_parser/NvOnnxParser.h
diff --git a/src/tensorRT/onnxplugin/plugins/DCNv2.cu b/src/tensorRT/onnxplugin/plugins/DCNv2.cu
diff --git a/workspace/my-yolov5s-car.jpg b/workspace/my-yolov5s-car.jpg

Original file line number	Diff line number	Diff line change
`@@ -7,6 +7,9 @@`
`7`	`7`	`"label": "build",`
`8`	`8`	`"type": "shell",`
`9`	`9`	`"command": "make pro -j25"`
	`10`	`+`
	`11`	`+ // for cmake`
	`12`	`+ //"command": "cd build && make pro -j25"`
`10`	`13`	`}`
`11`	`14`	`]`
`12`	`15`	`}`