[NMS] Add NMS f32 cuda kernel. (#102)

bear-zd · web-flow · commit 2f5740bf3848 · 2024-10-25T14:34:18.000+08:00
diff --git a/nms/README.md b/nms/README.md
@@ -0,0 +1,43 @@
+# NMS
+
+## 0x00 说明
+
+包含以下内容：
+
+- [X] nms_kernel(CPU/GPU)
+- [X] PyTorch bindings
+
+这里的 NMS CUDA 实现是最基础的版本，可参考 [torchvision 官方源码](https://github.com/pytorch/vision/blob/main/torchvision/csrc/ops/cuda/nms_kernel.cu)做进一步优化。
+
+## 测试
+
+```bash
+# 只测试Ada架构 不指定默认编译所有架构 耗时较长: Volta, Ampere, Ada, Hopper, ...
+export TORCH_CUDA_ARCH_LIST=Ada 
+python3 nms.py
+```
+
+输出:
+
+```bash
+-------------------------------------------------------------------------------------
+                                        nboxes=1024
+       out_nms: ['1021 ', '1022 ', '1023 '], len of keep: 950, time:0.26456594ms
+    out_nms_th: ['1021 ', '1022 ', '1023 '], len of keep: 950, time:0.19218683ms
+-------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------
+                                        nboxes=2048
+       out_nms: ['2045 ', '2046 ', '2047 '], len of keep: 1838, time:0.47256470ms
+    out_nms_th: ['2044 ', '2045 ', '2047 '], len of keep: 1838, time:0.39437532ms
+-------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------
+                                        nboxes=4096
+       out_nms: ['4092 ', '4093 ', '4095 '], len of keep: 3598, time:0.89909315ms
+    out_nms_th: ['4093 ', '4094 ', '4095 '], len of keep: 3598, time:1.03515625ms
+-------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------
+                                        nboxes=8192
+       out_nms: ['8189 ', '8190 ', '8191 '], len of keep: 7023, time:1.49935722ms
+    out_nms_th: ['8189 ', '8190 ', '8191 '], len of keep: 7023, time:3.39094877ms
+-------------------------------------------------------------------------------------
+```
diff --git a/nms/nms.cu b/nms/nms.cu
@@ -1 +1,103 @@
-// TODO: CUDA NMS
+#include <stdio.h>
+#include <stdlib.h>
+#include <float.h>
+#include <vector>
+#include <algorithm>
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#include <torch/types.h>
+#include <torch/extension.h>
+
+// Number of threads in one warp.
+#define WARP_SIZE 32
+// Takes the address of `value`, reinterprets it as an int4 pointer and yields
+// the first int4 stored there.
+// NOTE(review): not referenced anywhere in the visible part of this file.
+#define INT4(value) (reinterpret_cast<int4 *>(&(value))[0])
+// Same as INT4, but reinterprets the storage as a float4.
+// NOTE(review): not referenced anywhere in the visible part of this file.
+#define FLOAT4(value) (reinterpret_cast<float4 *>(&(value))[0])
+
+// Greedy non-maximum suppression over boxes that are already ordered from the
+// highest to the lowest score.
+//
+// Parameters:
+//   boxes         - num_boxes * 4 floats, one (x1, y1, x2, y2) tuple per box.
+//   scores        - not read; kept so the launch signature does not change.
+//   keep          - output array of num_boxes ints. keep[j] ends up 1 when box j
+//                   survives and 0 when it is suppressed. It does not have to
+//                   be initialised by the caller.
+//   num_boxes     - number of boxes.
+//   iou_threshold - box j is suppressed when its IoU with an earlier surviving
+//                   box is strictly greater than this value.
+//
+// Launch layout: 1-D blocks of any size. Whether box j survives depends on the
+// final keep flags of every box before it, so the boxes have to be resolved in
+// order. __syncthreads() is the only barrier used here and it only covers one
+// block, therefore block 0 does all the work and every other block of the grid
+// returns immediately. The kernel is correct for any 1-D grid; a larger
+// blockDim.x only makes each step faster. No shared memory is used.
+__global__ void nms_kernel(const float *boxes, const float *scores, int *keep, int num_boxes, float iou_threshold) {
+  (void)scores;
+
+  // The whole block leaves together, so no thread is left waiting at a barrier.
+  if (blockIdx.x != 0)
+    return;
+
+  const int tid = threadIdx.x;
+  const int stride = blockDim.x;
+
+  // Start with every box kept; boxes are only ever switched off afterwards.
+  for (int j = tid; j < num_boxes; j += stride)
+    keep[j] = 1;
+  __syncthreads();
+
+  for (int i = 0; i < num_boxes; ++i) {
+    // keep[i] is final here: only boxes with a larger index are written in
+    // step i, and the barrier at the end of every earlier step published all
+    // earlier writes. The value is the same for all threads of the block.
+    if (keep[i] != 0) {
+      const float x1 = boxes[i * 4 + 0];
+      const float y1 = boxes[i * 4 + 1];
+      const float x2 = boxes[i * 4 + 2];
+      const float y2 = boxes[i * 4 + 3];
+      const float area = (x2 - x1) * (y2 - y1);
+
+      // The threads of the block share the boxes that come after box i.
+      for (int j = i + 1 + tid; j < num_boxes; j += stride) {
+        if (keep[j] == 0)
+          continue;
+
+        const float x1_j = boxes[j * 4 + 0];
+        const float y1_j = boxes[j * 4 + 1];
+        const float x2_j = boxes[j * 4 + 2];
+        const float y2_j = boxes[j * 4 + 3];
+
+        const float inter_w = max(0.0f, min(x2, x2_j) - max(x1, x1_j));
+        const float inter_h = max(0.0f, min(y2, y2_j) - max(y1, y1_j));
+        const float inter_area = inter_w * inter_h;
+
+        const float area_j = (x2_j - x1_j) * (y2_j - y1_j);
+        const float iou = inter_area / (area + area_j - inter_area);
+
+        if (iou > iou_threshold)
+          keep[j] = 0;
+      }
+    }
+    // Placed outside the branch so that every thread reaches it; makes the
+    // flags written in this step visible before the next box is examined.
+    __syncthreads();
+  }
+}
+
+// --------------------- PyTorch bindings for custom kernel -----------------------
+// Turns the token `str` into a string literal.
+#define STRINGFY(str) #str
+// Registers `func` on the pybind11 module object `m` under its own name; the
+// same text is passed as the third m.def argument. The expansion already ends
+// in ';'.
+#define TORCH_BINDING_COMMON_EXTENSION(func) \
+  m.def(STRINGFY(func), &func, STRINGFY(func));
+
+// Prints the options of tensor `T` and throws std::runtime_error when its dtype
+// differs from `th_type`. Wrapped in do { } while (0) so that the macro behaves
+// like a single statement (safe in an unbraced if/else); callers terminate it
+// with ';'.
+#define CHECK_TORCH_TENSOR_DTYPE(T, th_type)                     \
+  do {                                                           \
+    if (((T).options().dtype() != (th_type))) {                  \
+      std::cout << "Tensor Info:" << (T).options() << std::endl; \
+      throw std::runtime_error("values must be " #th_type);      \
+    }                                                            \
+  } while (0)
+
+// Runs non-maximum suppression on the GPU.
+//
+// Parameters:
+//   boxes         - float32 CUDA tensor of shape (N, 4), one (x1, y1, x2, y2)
+//                   row per box.
+//   scores        - float32 CUDA tensor of shape (N,).
+//   iou_threshold - forwarded unchanged to nms_kernel.
+//
+// Returns an int32 CPU tensor with the indices (into the caller's `boxes`) of
+// the boxes that were kept, ordered by descending score.
+//
+// Throws std::runtime_error on a wrong dtype, a non-CUDA tensor, mismatching
+// shapes, or a failed kernel launch.
+torch::Tensor nms(torch::Tensor boxes, torch::Tensor scores, float iou_threshold) {
+  CHECK_TORCH_TENSOR_DTYPE(boxes, torch::kFloat32);
+  CHECK_TORCH_TENSOR_DTYPE(scores, torch::kFloat32);
+  // The raw pointers below are dereferenced on the device, so reject host
+  // tensors and inconsistent shapes before launching anything.
+  if (!boxes.is_cuda() || !scores.is_cuda()) {
+    throw std::runtime_error("boxes and scores must be CUDA tensors");
+  }
+  if (boxes.dim() != 2 || boxes.size(1) != 4 || scores.dim() != 1 ||
+      scores.size(0) != boxes.size(0)) {
+    throw std::runtime_error("expected boxes of shape (N, 4) and scores of shape (N,)");
+  }
+
+  const int num_boxes = static_cast<int>(boxes.size(0));
+  if (num_boxes == 0) {
+    // A zero-sized grid is an invalid launch configuration; nothing to do.
+    return torch::empty({0}, torch::TensorOptions().dtype(torch::kInt32));
+  }
+
+  auto toption = torch::TensorOptions().dtype(torch::kInt32).device(boxes.device());
+  auto keep = torch::empty({boxes.size(0)}, toption);
+
+  // 32-thread blocks leave most of an SM idle; 256 is a multiple of the warp
+  // size and the grid still covers every box.
+  constexpr int kThreadsPerBlock = 256;
+  dim3 block(kThreadsPerBlock);
+  dim3 grid((num_boxes + kThreadsPerBlock - 1) / kThreadsPerBlock);
+
+  // sort boxes by scores (stable, descending); order_t[k] is the original
+  // index of the box that ends up at sorted position k
+  auto order_t = std::get<1>(
+      scores.sort(/*stable=*/true, /*dim=*/0, /* descending=*/true));
+  auto boxes_sorted = boxes.index_select(0, order_t).contiguous();
+
+  // NOTE(review): launched on the default stream, not on PyTorch's current
+  // stream - confirm this is acceptable when callers use custom streams.
+  nms_kernel<<<grid, block>>>(
+      boxes_sorted.data_ptr<float>(),
+      scores.data_ptr<float>(),
+      keep.data_ptr<int>(),
+      num_boxes, iou_threshold);
+  const cudaError_t launch_err = cudaGetLastError();
+  if (launch_err != cudaSuccess) {
+    throw std::runtime_error(std::string("nms_kernel launch failed: ") +
+                             cudaGetErrorString(launch_err));
+  }
+
+  // keep[k] refers to sorted position k, so translate the surviving positions
+  // back to indices of the caller's tensor through the sort permutation.
+  auto kept_original = order_t.masked_select(keep.eq(1));
+  return kept_original.to(torch::kCPU, torch::kInt32);
+}
+
+// Python module definition: registers the C++ function `nms` on the module
+// under the name "nms", with "nms" as its docstring.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("nms", &nms, "nms");
+}
diff --git a/nms/nms.py b/nms/nms.py
@@ -0,0 +1,84 @@
+import torch
+import time
+from torch.utils.cpp_extension import load
+from typing import Optional
+from functools import partial
+from torchvision.ops import nms
+torch.set_grad_enabled(False)
+
+# Load the CUDA kernel as a python module.
+# torch.utils.cpp_extension.load builds nms.cu (expected in the current working
+# directory) and returns the resulting extension module, bound here to `lib`.
+lib = load(
+    name="nms_lib",
+    sources=["nms.cu"],
+    # Flags handed to the CUDA compiler.
+    extra_cuda_cflags=[
+        "-O3",
+        # The -U options undefine the listed __CUDA_NO_* macros.
+        # NOTE(review): presumably to re-enable half/bfloat16 operators and
+        # conversions; nms.cu does not appear to use them - confirm.
+        "-U__CUDA_NO_HALF_OPERATORS__",
+        "-U__CUDA_NO_HALF_CONVERSIONS__",
+        "-U__CUDA_NO_HALF2_OPERATORS__",
+        "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
+        "--expt-relaxed-constexpr",
+        "--expt-extended-lambda",
+        "--use_fast_math",
+    ],
+    # Flags handed to the host C++ compiler.
+    extra_cflags=["-std=c++17"],
+)
+
+
+def generate_random_data(Nboxes):
+    boxes = torch.rand(Nboxes, 4)
+    for i in range(Nboxes):
+        if boxes[i, 0] > boxes[i, 2]:
+            boxes[i, 0], boxes[i, 2] = boxes[i, 2], boxes[i, 0]
+        if boxes[i, 1] > boxes[i, 3]:
+            boxes[i, 1], boxes[i, 3] = boxes[i, 3], boxes[i, 1]
+    scores = torch.rand(Nboxes)
+    return boxes, scores
+
+
+def run_benchmark(
+    perf_func: callable,
+    scores: torch.Tensor,
+    boxes: torch.Tensor,
+    thresholds: float,
+    tag: str,
+    warmup: int = 10,
+    iters: int = 100,
+    show_all: bool = False,
+):
+    # warmup
+    for i in range(warmup):
+        out = perf_func(scores, boxes, thresholds)
+    torch.cuda.synchronize()
+
+    start = time.time()
+    # iters
+    for i in range(iters):
+        out = perf_func(scores, boxes, thresholds)
+    torch.cuda.synchronize()
+    end = time.time()
+    total_time = (end - start) * 1000  # ms
+    mean_time = total_time / iters
+    out_info = f"{tag}"
+    out_val = sorted(out.flatten().detach().cpu().numpy().tolist())
+    len_val = len(out_val)
+    out_val = out_val[-min(3, len_val) :]
+    out_val = [f"{v:<5}" for v in out_val]
+    print(f"{out_info:>14}: {out_val}, len of keep: {len_val}, time:{mean_time:.8f}ms")
+    if show_all:
+        print(out)
+    return out, mean_time
+
+
+# Numbers of boxes to benchmark, one run per entry.
+Nboxes = [1024, 2048, 4096, 8192]
+# IoU threshold passed to both NMS implementations.
+thresholds = 0.5
+
+
+# For every problem size: generate CPU data, move it to the GPU as contiguous
+# float32 tensors, then time the custom kernel (lib.nms) and torchvision's nms
+# on the same inputs.
+for nboxes in Nboxes:
+    print("-" * 85)
+    print(" " * 40 + f"nboxes={nboxes}")
+    boxes, scores = generate_random_data(nboxes)
+    boxes = boxes.cuda().float().contiguous()
+    scores = scores.cuda().float().contiguous()
+    # Arguments are forwarded positionally as (boxes, scores, thresholds).
+    run_benchmark(lib.nms, boxes, scores, thresholds, "nms")
+    run_benchmark(nms, boxes, scores, thresholds, "nms_th")
+    print("-" * 85)