Add pybind11 bindings for Chamfer Distance and template the CUDA kernels on the scalar type.

hzxie · hzxie · commit a50d3f21f9e2 · 2020-07-07T22:09:40.000+08:00
diff --git a/cuda/include/chamfer_dist.h b/cuda/include/chamfer_dist.h
@@ -1,9 +1,15 @@
 #include <torch/extension.h>
 #include <vector>
 
+std::vector<torch::Tensor> chamfer_dist(torch::Tensor xyz1, torch::Tensor xyz2);
+
+std::vector<torch::Tensor> chamfer_dist_grad(torch::Tensor xyz1, torch::Tensor xyz2,
+                                             torch::Tensor idx1, torch::Tensor idx2,
+                                             torch::Tensor grad_dist1, torch::Tensor grad_dist2);
+
 std::vector<torch::Tensor> chamfer_dist_kernel_wrapper(torch::Tensor xyz1, torch::Tensor xyz2);
 
 std::vector<torch::Tensor> chamfer_dist_grad_kernel_wrapper(torch::Tensor xyz1, torch::Tensor xyz2,
                                                             torch::Tensor idx1, torch::Tensor idx2,
                                                             torch::Tensor grad_dist1,
-                                                            torch::Tensor grad_dist2);
+                                                            torch::Tensor grad_dist2);
diff --git a/cuda/src/bindings.cpp b/cuda/src/bindings.cpp
@@ -1,4 +1,5 @@
 #include "ball_query.h"
+#include "chamfer_dist.h"
 #include "interpolate.h"
 #include "metrics.h"
 #include "sampling.h"
@@ -15,4 +16,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
     m.def("ball_query_partial_dense", &ball_query_partial_dense);
 
     m.def("instance_iou_cuda", &instance_iou_cuda);
+
+    m.def("chamfer_dist", &chamfer_dist);
+    m.def("chamfer_dist_grad", &chamfer_dist_grad);
 }
diff --git a/cuda/src/chamfer_dist.cu b/cuda/src/chamfer_dist.cu
@@ -4,11 +4,13 @@
 
 #include <vector>
 
-__global__ void chamfer_dist_kernel(int batch_size, int n, const float* xyz1, int m,
-                                    const float* xyz2, float* dist, int* indexes)
+template <typename scalar_t>
+__global__ void chamfer_dist_kernel(int batch_size, int n, const scalar_t* __restrict__ xyz1, int m,
+                                    const scalar_t* __restrict__ xyz2, scalar_t* __restrict__ dist,
+                                    int* indexes)
 {
     const int batch = 512;
-    __shared__ float buf[batch * 3];
+    __shared__ scalar_t buf[batch * 3];
     for (int i = blockIdx.x; i < batch_size; i += gridDim.x)
     {
         for (int k2 = 0; k2 < m; k2 += batch)
@@ -21,21 +23,21 @@ __global__ void chamfer_dist_kernel(int batch_size, int n, const float* xyz1, in
             __syncthreads();
             for (int j = threadIdx.x + blockIdx.y * blockDim.x; j < n; j += blockDim.x * gridDim.y)
             {
-                float x1 = xyz1[(i * n + j) * 3 + 0];
-                float y1 = xyz1[(i * n + j) * 3 + 1];
-                float z1 = xyz1[(i * n + j) * 3 + 2];
-                float best_dist = 0;
+                scalar_t x1 = xyz1[(i * n + j) * 3 + 0];
+                scalar_t y1 = xyz1[(i * n + j) * 3 + 1];
+                scalar_t z1 = xyz1[(i * n + j) * 3 + 2];
+                scalar_t best_dist = 0;
                 int best_dist_index = 0;
                 int end_ka = end_k - (end_k & 3);
                 if (end_ka == batch)
                 {
                     for (int k = 0; k < batch; k += 4)
                     {
                         {
-                            float x2 = buf[k * 3 + 0] - x1;
-                            float y2 = buf[k * 3 + 1] - y1;
-                            float z2 = buf[k * 3 + 2] - z1;
-                            float dist = x2 * x2 + y2 * y2 + z2 * z2;
+                            scalar_t x2 = buf[k * 3 + 0] - x1;
+                            scalar_t y2 = buf[k * 3 + 1] - y1;
+                            scalar_t z2 = buf[k * 3 + 2] - z1;
+                            scalar_t dist = x2 * x2 + y2 * y2 + z2 * z2;
 
                             if (k == 0 || dist < best_dist)
                             {
@@ -44,32 +46,32 @@ __global__ void chamfer_dist_kernel(int batch_size, int n, const float* xyz1, in
                             }
                         }
                         {
-                            float x2 = buf[k * 3 + 3] - x1;
-                            float y2 = buf[k * 3 + 4] - y1;
-                            float z2 = buf[k * 3 + 5] - z1;
-                            float dist = x2 * x2 + y2 * y2 + z2 * z2;
+                            scalar_t x2 = buf[k * 3 + 3] - x1;
+                            scalar_t y2 = buf[k * 3 + 4] - y1;
+                            scalar_t z2 = buf[k * 3 + 5] - z1;
+                            scalar_t dist = x2 * x2 + y2 * y2 + z2 * z2;
                             if (dist < best_dist)
                             {
                                 best_dist = dist;
                                 best_dist_index = k + k2 + 1;
                             }
                         }
                         {
-                            float x2 = buf[k * 3 + 6] - x1;
-                            float y2 = buf[k * 3 + 7] - y1;
-                            float z2 = buf[k * 3 + 8] - z1;
-                            float dist = x2 * x2 + y2 * y2 + z2 * z2;
+                            scalar_t x2 = buf[k * 3 + 6] - x1;
+                            scalar_t y2 = buf[k * 3 + 7] - y1;
+                            scalar_t z2 = buf[k * 3 + 8] - z1;
+                            scalar_t dist = x2 * x2 + y2 * y2 + z2 * z2;
                             if (dist < best_dist)
                             {
                                 best_dist = dist;
                                 best_dist_index = k + k2 + 2;
                             }
                         }
                         {
-                            float x2 = buf[k * 3 + 9] - x1;
-                            float y2 = buf[k * 3 + 10] - y1;
-                            float z2 = buf[k * 3 + 11] - z1;
-                            float dist = x2 * x2 + y2 * y2 + z2 * z2;
+                            scalar_t x2 = buf[k * 3 + 9] - x1;
+                            scalar_t y2 = buf[k * 3 + 10] - y1;
+                            scalar_t z2 = buf[k * 3 + 11] - z1;
+                            scalar_t dist = x2 * x2 + y2 * y2 + z2 * z2;
                             if (dist < best_dist)
                             {
                                 best_dist = dist;
@@ -83,43 +85,43 @@ __global__ void chamfer_dist_kernel(int batch_size, int n, const float* xyz1, in
                     for (int k = 0; k < end_ka; k += 4)
                     {
                         {
-                            float x2 = buf[k * 3 + 0] - x1;
-                            float y2 = buf[k * 3 + 1] - y1;
-                            float z2 = buf[k * 3 + 2] - z1;
-                            float dist = x2 * x2 + y2 * y2 + z2 * z2;
+                            scalar_t x2 = buf[k * 3 + 0] - x1;
+                            scalar_t y2 = buf[k * 3 + 1] - y1;
+                            scalar_t z2 = buf[k * 3 + 2] - z1;
+                            scalar_t dist = x2 * x2 + y2 * y2 + z2 * z2;
                             if (k == 0 || dist < best_dist)
                             {
                                 best_dist = dist;
                                 best_dist_index = k + k2;
                             }
                         }
                         {
-                            float x2 = buf[k * 3 + 3] - x1;
-                            float y2 = buf[k * 3 + 4] - y1;
-                            float z2 = buf[k * 3 + 5] - z1;
-                            float dist = x2 * x2 + y2 * y2 + z2 * z2;
+                            scalar_t x2 = buf[k * 3 + 3] - x1;
+                            scalar_t y2 = buf[k * 3 + 4] - y1;
+                            scalar_t z2 = buf[k * 3 + 5] - z1;
+                            scalar_t dist = x2 * x2 + y2 * y2 + z2 * z2;
                             if (dist < best_dist)
                             {
                                 best_dist = dist;
                                 best_dist_index = k + k2 + 1;
                             }
                         }
                         {
-                            float x2 = buf[k * 3 + 6] - x1;
-                            float y2 = buf[k * 3 + 7] - y1;
-                            float z2 = buf[k * 3 + 8] - z1;
-                            float dist = x2 * x2 + y2 * y2 + z2 * z2;
+                            scalar_t x2 = buf[k * 3 + 6] - x1;
+                            scalar_t y2 = buf[k * 3 + 7] - y1;
+                            scalar_t z2 = buf[k * 3 + 8] - z1;
+                            scalar_t dist = x2 * x2 + y2 * y2 + z2 * z2;
                             if (dist < best_dist)
                             {
                                 best_dist = dist;
                                 best_dist_index = k + k2 + 2;
                             }
                         }
                         {
-                            float x2 = buf[k * 3 + 9] - x1;
-                            float y2 = buf[k * 3 + 10] - y1;
-                            float z2 = buf[k * 3 + 11] - z1;
-                            float dist = x2 * x2 + y2 * y2 + z2 * z2;
+                            scalar_t x2 = buf[k * 3 + 9] - x1;
+                            scalar_t y2 = buf[k * 3 + 10] - y1;
+                            scalar_t z2 = buf[k * 3 + 11] - z1;
+                            scalar_t dist = x2 * x2 + y2 * y2 + z2 * z2;
                             if (dist < best_dist)
                             {
                                 best_dist = dist;
@@ -130,10 +132,10 @@ __global__ void chamfer_dist_kernel(int batch_size, int n, const float* xyz1, in
                 }
                 for (int k = end_ka; k < end_k; k++)
                 {
-                    float x2 = buf[k * 3 + 0] - x1;
-                    float y2 = buf[k * 3 + 1] - y1;
-                    float z2 = buf[k * 3 + 2] - z1;
-                    float dist = x2 * x2 + y2 * y2 + z2 * z2;
+                    scalar_t x2 = buf[k * 3 + 0] - x1;
+                    scalar_t y2 = buf[k * 3 + 1] - y1;
+                    scalar_t z2 = buf[k * 3 + 2] - z1;
+                    scalar_t dist = x2 * x2 + y2 * y2 + z2 * z2;
                     if (k == 0 || dist < best_dist)
                     {
                         best_dist = dist;
@@ -161,12 +163,16 @@ std::vector<torch::Tensor> chamfer_dist_kernel_wrapper(torch::Tensor xyz1, torch
     torch::Tensor idx1 = torch::zeros({batch_size, n}, torch::CUDA(torch::kInt));
     torch::Tensor idx2 = torch::zeros({batch_size, m}, torch::CUDA(torch::kInt));
 
-    chamfer_dist_kernel<<<dim3(32, 16, 1), 512>>>(batch_size, n, xyz1.data_ptr<float>(), m,
-                                                  xyz2.data_ptr<float>(), dist1.data_ptr<float>(),
-                                                  idx1.data_ptr<int>());
-    chamfer_dist_kernel<<<dim3(32, 16, 1), 512>>>(batch_size, m, xyz2.data_ptr<float>(), n,
-                                                  xyz1.data_ptr<float>(), dist2.data_ptr<float>(),
-                                                  idx2.data_ptr<int>());
+    AT_DISPATCH_FLOATING_TYPES(
+        xyz1.scalar_type(), "chamfer_dist_cuda", ([&] {
+            chamfer_dist_kernel<scalar_t><<<dim3(32, 16, 1), 512>>>(
+                batch_size, n, xyz1.data_ptr<scalar_t>(), m, xyz2.data_ptr<scalar_t>(),
+                dist1.data_ptr<scalar_t>(), idx1.data_ptr<int>());
+
+            chamfer_dist_kernel<scalar_t><<<dim3(32, 16, 1), 512>>>(
+                batch_size, m, xyz2.data_ptr<scalar_t>(), n, xyz1.data_ptr<scalar_t>(),
+                dist2.data_ptr<scalar_t>(), idx2.data_ptr<int>());
+        }));
 
     cudaError_t err = cudaGetLastError();
     if (err != cudaSuccess)
@@ -176,22 +182,25 @@ std::vector<torch::Tensor> chamfer_dist_kernel_wrapper(torch::Tensor xyz1, torch
     return {dist1, dist2, idx1, idx2};
 }
 
-__global__ void chamfer_dist_grad_kernel(int b, int n, const float* xyz1, int m, const float* xyz2,
-                                         const float* grad_dist1, const int* idx1, float* grad_xyz1,
-                                         float* grad_xyz2)
+template <typename scalar_t>
+__global__ void chamfer_dist_grad_kernel(int b, int n, const scalar_t* __restrict__ xyz1, int m,
+                                         const scalar_t* __restrict__ xyz2,
+                                         const scalar_t* __restrict__ grad_dist1, const int* idx1,
+                                         scalar_t* __restrict__ grad_xyz1,
+                                         scalar_t* __restrict__ grad_xyz2)
 {
     for (int i = blockIdx.x; i < b; i += gridDim.x)
     {
         for (int j = threadIdx.x + blockIdx.y * blockDim.x; j < n; j += blockDim.x * gridDim.y)
         {
-            float x1 = xyz1[(i * n + j) * 3 + 0];
-            float y1 = xyz1[(i * n + j) * 3 + 1];
-            float z1 = xyz1[(i * n + j) * 3 + 2];
+            scalar_t x1 = xyz1[(i * n + j) * 3 + 0];
+            scalar_t y1 = xyz1[(i * n + j) * 3 + 1];
+            scalar_t z1 = xyz1[(i * n + j) * 3 + 2];
             int j2 = idx1[i * n + j];
-            float x2 = xyz2[(i * m + j2) * 3 + 0];
-            float y2 = xyz2[(i * m + j2) * 3 + 1];
-            float z2 = xyz2[(i * m + j2) * 3 + 2];
-            float g = grad_dist1[i * n + j] * 2;
+            scalar_t x2 = xyz2[(i * m + j2) * 3 + 0];
+            scalar_t y2 = xyz2[(i * m + j2) * 3 + 1];
+            scalar_t z2 = xyz2[(i * m + j2) * 3 + 2];
+            scalar_t g = grad_dist1[i * n + j] * 2;
             atomicAdd(&(grad_xyz1[(i * n + j) * 3 + 0]), g * (x1 - x2));
             atomicAdd(&(grad_xyz1[(i * n + j) * 3 + 1]), g * (y1 - y2));
             atomicAdd(&(grad_xyz1[(i * n + j) * 3 + 2]), g * (z1 - z2));
@@ -213,14 +222,20 @@ std::vector<torch::Tensor> chamfer_dist_grad_kernel_wrapper(torch::Tensor xyz1,
-    torch::Tensor grad_xyz1 = torch::zeros_like(xyz1, torch::CUDA(torch::kFloat));
-    torch::Tensor grad_xyz2 = torch::zeros_like(xyz2, torch::CUDA(torch::kFloat));
+    // Allocate the gradients with the inputs' own dtype and device: the dispatch below calls
+    // data_ptr<scalar_t>() on them, which rejects a float32 buffer when scalar_t is double.
+    torch::Tensor grad_xyz1 = torch::zeros_like(xyz1);
+    torch::Tensor grad_xyz2 = torch::zeros_like(xyz2);
 
-    chamfer_dist_grad_kernel<<<dim3(1, 16, 1), 256>>>(
-        batch_size, n, xyz1.data_ptr<float>(), m, xyz2.data_ptr<float>(),
-        grad_dist1.data_ptr<float>(), idx1.data_ptr<int>(), grad_xyz1.data_ptr<float>(),
-        grad_xyz2.data_ptr<float>());
-    chamfer_dist_grad_kernel<<<dim3(1, 16, 1), 256>>>(
-        batch_size, m, xyz2.data_ptr<float>(), n, xyz1.data_ptr<float>(),
-        grad_dist2.data_ptr<float>(), idx2.data_ptr<int>(), grad_xyz2.data_ptr<float>(),
-        grad_xyz1.data_ptr<float>());
+    AT_DISPATCH_FLOATING_TYPES(
+        xyz1.scalar_type(), "chamfer_dist_grad_cuda", ([&] {
+            chamfer_dist_grad_kernel<scalar_t><<<dim3(1, 16, 1), 256>>>(
+                batch_size, n, xyz1.data_ptr<scalar_t>(), m, xyz2.data_ptr<scalar_t>(),
+                grad_dist1.data_ptr<scalar_t>(), idx1.data_ptr<int>(),
+                grad_xyz1.data_ptr<scalar_t>(), grad_xyz2.data_ptr<scalar_t>());
+
+            chamfer_dist_grad_kernel<scalar_t><<<dim3(1, 16, 1), 256>>>(
+                batch_size, m, xyz2.data_ptr<scalar_t>(), n, xyz1.data_ptr<scalar_t>(),
+                grad_dist2.data_ptr<scalar_t>(), idx2.data_ptr<int>(),
+                grad_xyz2.data_ptr<scalar_t>(), grad_xyz1.data_ptr<scalar_t>());
+        }));
 
     cudaError_t err = cudaGetLastError();
     if (err != cudaSuccess)
diff --git a/test/test_chamfer_dist.py b/test/test_chamfer_dist.py
diff --git a/torch_points_kernels/torchpoints.py b/torch_points_kernels/torchpoints.py
@@ -241,7 +241,6 @@ class ChamferFunction(Function):
     @staticmethod
     def forward(ctx, xyz1, xyz2):
         dist1, dist2, idx1, idx2 = tpcuda.chamfer_dist(xyz1, xyz2)
-        print(dir(tpcuda))
         ctx.save_for_backward(xyz1, xyz2, idx1, idx2)
 
         return dist1, dist2