Create the implementation for Chamfer Distance. [ci skip]

hzxie · hzxie · commit 652c179cb3a6 · 2020-07-07T20:46:27.000+08:00
diff --git a/cuda/include/chamfer_dist.h b/cuda/include/chamfer_dist.h
@@ -0,0 +1,9 @@
+#pragma once
+
+#include <torch/extension.h>
+#include <vector>
+
+// Forward pass of the Chamfer distance.
+//
+// xyz1 holds B point clouds of n points and xyz2 holds B point clouds of m points; the CUDA
+// implementation reads both as float buffers laid out as (B, num_points, 3).
+// Returns {dist1, dist2, idx1, idx2}: for every point of one cloud, the squared distance to its
+// nearest neighbour in the other cloud (dist*) and the position of that neighbour (idx*).
+std::vector<torch::Tensor> chamfer_dist_kernel_wrapper(torch::Tensor xyz1, torch::Tensor xyz2);
+
+// Backward pass of the Chamfer distance.
+//
+// idx1/idx2 are the neighbour indices produced by the forward pass; grad_dist1/grad_dist2 are
+// the gradients flowing into dist1/dist2.
+// Returns {grad_xyz1, grad_xyz2}, the gradients with respect to xyz1 and xyz2.
+std::vector<torch::Tensor> chamfer_dist_grad_kernel_wrapper(torch::Tensor xyz1, torch::Tensor xyz2,
+                                                            torch::Tensor idx1, torch::Tensor idx2,
+                                                            torch::Tensor grad_dist1,
+                                                            torch::Tensor grad_dist2);
diff --git a/cuda/src/chamfer_dist.cu b/cuda/src/chamfer_dist.cu
@@ -0,0 +1,231 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <torch/extension.h>
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include <limits>
+#include <vector>
+
+// Compares one staged candidate against the running best for a single query point.
+//
+// tile        - candidate coordinates, three floats per point
+// slot        - position of the candidate inside the tile
+// candidate   - index of the candidate within its full point cloud
+// always_take - accept the candidate without comparing (used for the first slot of a tile)
+// best / best_index are updated in place; the comparison is strict, so on equal distances
+// the candidate seen first is kept.
+__device__ __forceinline__ void chamfer_dist_try_candidate(const float* tile, int slot,
+                                                           int candidate, bool always_take,
+                                                           float px, float py, float pz,
+                                                           float& best, int& best_index)
+{
+    const float dx = tile[slot * 3 + 0] - px;
+    const float dy = tile[slot * 3 + 1] - py;
+    const float dz = tile[slot * 3 + 2] - pz;
+    const float squared = dx * dx + dy * dy + dz * dz;
+    if (always_take || squared < best)
+    {
+        best = squared;
+        best_index = candidate;
+    }
+}
+
+// Evaluates the four consecutive candidates starting at `slot`, in order.
+__device__ __forceinline__ void chamfer_dist_try_four(const float* tile, int slot, int tile_start,
+                                                      float px, float py, float pz, float& best,
+                                                      int& best_index)
+{
+    chamfer_dist_try_candidate(tile, slot + 0, slot + tile_start, slot == 0, px, py, pz, best,
+                               best_index);
+    chamfer_dist_try_candidate(tile, slot + 1, slot + tile_start + 1, false, px, py, pz, best,
+                               best_index);
+    chamfer_dist_try_candidate(tile, slot + 2, slot + tile_start + 2, false, px, py, pz, best,
+                               best_index);
+    chamfer_dist_try_candidate(tile, slot + 3, slot + tile_start + 3, false, px, py, pz, best,
+                               best_index);
+}
+
+// Nearest-neighbour search behind the Chamfer distance forward pass.
+//
+// For each point j of cloud i in xyz1 (read as xyz1[(i * n + j) * 3 + c]) the kernel scans the
+// m points of cloud i in xyz2 and writes the smallest squared Euclidean distance to
+// dist[i * n + j] and the position of that xyz2 point to indexes[i * n + j].
+//
+// Launch layout: blockIdx.x strides over the batch; blockIdx.y together with threadIdx.x
+// strides over the n query points. xyz2 is staged through shared memory in tiles of 512
+// points (512 * 3 floats of static shared memory per block). All index arithmetic is done
+// in int.
+__global__ void chamfer_dist_kernel(int batch_size, int n, const float* xyz1, int m,
+                                    const float* xyz2, float* dist, int* indexes)
+{
+    constexpr int tile_capacity = 512;
+    __shared__ float buf[tile_capacity * 3];
+
+    for (int cloud = blockIdx.x; cloud < batch_size; cloud += gridDim.x)
+    {
+        for (int tile_start = 0; tile_start < m; tile_start += tile_capacity)
+        {
+            // Number of xyz2 points in this tile (the last tile may be partial).
+            const int tile_len = min(m, tile_start + tile_capacity) - tile_start;
+
+            // Cooperative copy of the tile into shared memory.
+            for (int t = threadIdx.x; t < tile_len * 3; t += blockDim.x)
+            {
+                buf[t] = xyz2[(cloud * m + tile_start) * 3 + t];
+            }
+            __syncthreads();
+
+            // Largest multiple of four that fits in the tile; the rest is handled one by one.
+            const int quads_end = tile_len - (tile_len & 3);
+
+            for (int query = threadIdx.x + blockIdx.y * blockDim.x; query < n;
+                 query += blockDim.x * gridDim.y)
+            {
+                const int out = cloud * n + query;
+                const float px = xyz1[out * 3 + 0];
+                const float py = xyz1[out * 3 + 1];
+                const float pz = xyz1[out * 3 + 2];
+
+                float best = 0;
+                int best_index = 0;
+
+                if (quads_end == tile_capacity)
+                {
+                    // Full tile: the trip count is a compile-time constant.
+                    for (int slot = 0; slot < tile_capacity; slot += 4)
+                    {
+                        chamfer_dist_try_four(buf, slot, tile_start, px, py, pz, best, best_index);
+                    }
+                }
+                else
+                {
+                    for (int slot = 0; slot < quads_end; slot += 4)
+                    {
+                        chamfer_dist_try_four(buf, slot, tile_start, px, py, pz, best, best_index);
+                    }
+                }
+
+                // Leftover candidates when the tile length is not a multiple of four.
+                for (int slot = quads_end; slot < tile_len; slot++)
+                {
+                    chamfer_dist_try_candidate(buf, slot, slot + tile_start, slot == 0, px, py, pz,
+                                               best, best_index);
+                }
+
+                // The first tile initialises the output; later tiles only improve on it.
+                if (tile_start == 0 || dist[out] > best)
+                {
+                    dist[out] = best;
+                    indexes[out] = best_index;
+                }
+            }
+            // Nobody may overwrite the tile while another thread is still reading it.
+            __syncthreads();
+        }
+    }
+}
+
+// Host-side launcher for the Chamfer distance forward pass.
+//
+// xyz1 is (B, n, 3) and xyz2 is (B, m, 3); both must be float32 CUDA tensors on the same
+// device. Non-contiguous inputs are copied into contiguous storage before their raw pointers
+// are handed to the kernel, which indexes them as flat (B, num_points, 3) buffers.
+//
+// Returns {dist1, dist2, idx1, idx2}: dist1/idx1 are (B, n) and hold, for every xyz1 point, the
+// squared distance to and the index of its nearest xyz2 point; dist2/idx2 are (B, m) and hold
+// the opposite direction. If the other cloud is empty the corresponding outputs stay zero.
+// Throws c10::Error on invalid input or when a kernel launch fails.
+std::vector<torch::Tensor> chamfer_dist_kernel_wrapper(torch::Tensor xyz1, torch::Tensor xyz2)
+{
+    TORCH_CHECK(xyz1.is_cuda() && xyz2.is_cuda(),
+                "chamfer_dist: xyz1 and xyz2 must be CUDA tensors");
+    TORCH_CHECK(xyz1.device() == xyz2.device(),
+                "chamfer_dist: xyz1 and xyz2 must be on the same device");
+    TORCH_CHECK(xyz1.scalar_type() == torch::kFloat && xyz2.scalar_type() == torch::kFloat,
+                "chamfer_dist: xyz1 and xyz2 must be float32 tensors");
+    TORCH_CHECK(xyz1.dim() == 3 && xyz1.size(2) == 3 && xyz2.dim() == 3 && xyz2.size(2) == 3,
+                "chamfer_dist: xyz1 and xyz2 must have shape (B, num_points, 3)");
+    TORCH_CHECK(xyz1.size(0) == xyz2.size(0),
+                "chamfer_dist: xyz1 and xyz2 must have the same batch size");
+    // The kernel computes flat offsets with int arithmetic.
+    TORCH_CHECK(xyz1.numel() <= std::numeric_limits<int>::max() &&
+                    xyz2.numel() <= std::numeric_limits<int>::max(),
+                "chamfer_dist: inputs are too large for 32-bit indexing");
+
+    xyz1 = xyz1.contiguous();
+    xyz2 = xyz2.contiguous();
+
+    const int batch_size = static_cast<int>(xyz1.size(0));
+    const int n = static_cast<int>(xyz1.size(1)); // num_points point cloud A
+    const int m = static_cast<int>(xyz2.size(1)); // num_points point cloud B
+
+    // Allocate and launch on the inputs' device and on PyTorch's current stream.
+    const c10::cuda::CUDAGuard device_guard(xyz1.device());
+    cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+    const auto float_options = xyz1.options();
+    const auto index_options = xyz1.options().dtype(torch::kInt);
+    torch::Tensor dist1 = torch::zeros({batch_size, n}, float_options);
+    torch::Tensor dist2 = torch::zeros({batch_size, m}, float_options);
+    torch::Tensor idx1 = torch::zeros({batch_size, n}, index_options);
+    torch::Tensor idx2 = torch::zeros({batch_size, m}, index_options);
+
+    const dim3 grid(32, 16, 1);
+    const int threads = 512;
+
+    // xyz1 -> xyz2
+    chamfer_dist_kernel<<<grid, threads, 0, stream>>>(
+        batch_size, n, xyz1.data_ptr<float>(), m, xyz2.data_ptr<float>(),
+        dist1.data_ptr<float>(), idx1.data_ptr<int>());
+    cudaError_t err = cudaGetLastError();
+    TORCH_CHECK(err == cudaSuccess,
+                "Error in chamfer_dist_kernel_wrapper: ", cudaGetErrorString(err));
+
+    // xyz2 -> xyz1
+    chamfer_dist_kernel<<<grid, threads, 0, stream>>>(
+        batch_size, m, xyz2.data_ptr<float>(), n, xyz1.data_ptr<float>(),
+        dist2.data_ptr<float>(), idx2.data_ptr<int>());
+    err = cudaGetLastError();
+    TORCH_CHECK(err == cudaSuccess,
+                "Error in chamfer_dist_kernel_wrapper: ", cudaGetErrorString(err));
+
+    return {dist1, dist2, idx1, idx2};
+}
+
+// Backward kernel for one direction of the Chamfer distance.
+//
+// For every point j of cloud i in xyz1, idx1[i * n + j] names its matched point in xyz2.
+// With d = xyz1_point - xyz2_point and g = 2 * grad_dist1[i * n + j], the kernel adds g * d to
+// the xyz1 point's gradient and -(g * d) to the matched xyz2 point's gradient.
+//
+// Both outputs are accumulated with atomicAdd and are never cleared here, so the caller must
+// supply zero-initialised (or deliberately pre-filled) buffers.
+// Launch layout: blockIdx.x strides over the batch; blockIdx.y together with threadIdx.x
+// strides over the n points.
+__global__ void chamfer_dist_grad_kernel(int b, int n, const float* xyz1, int m, const float* xyz2,
+                                         const float* grad_dist1, const int* idx1, float* grad_xyz1,
+                                         float* grad_xyz2)
+{
+    const int first_point = threadIdx.x + blockIdx.y * blockDim.x;
+    const int point_stride = blockDim.x * gridDim.y;
+
+    for (int cloud = blockIdx.x; cloud < b; cloud += gridDim.x)
+    {
+        for (int point = first_point; point < n; point += point_stride)
+        {
+            const int src = cloud * n + point;      // flat index of the xyz1 point
+            const int dst = cloud * m + idx1[src];  // flat index of its matched xyz2 point
+
+            const float scale = grad_dist1[src] * 2;
+            const float gx = scale * (xyz1[src * 3 + 0] - xyz2[dst * 3 + 0]);
+            const float gy = scale * (xyz1[src * 3 + 1] - xyz2[dst * 3 + 1]);
+            const float gz = scale * (xyz1[src * 3 + 2] - xyz2[dst * 3 + 2]);
+
+            atomicAdd(&grad_xyz1[src * 3 + 0], gx);
+            atomicAdd(&grad_xyz1[src * 3 + 1], gy);
+            atomicAdd(&grad_xyz1[src * 3 + 2], gz);
+            // Several xyz1 points may share one xyz2 neighbour, hence the atomics.
+            atomicAdd(&grad_xyz2[dst * 3 + 0], -gx);
+            atomicAdd(&grad_xyz2[dst * 3 + 1], -gy);
+            atomicAdd(&grad_xyz2[dst * 3 + 2], -gz);
+        }
+    }
+}
+
+// Host-side launcher for the Chamfer distance backward pass.
+//
+// xyz1 (B, n, 3) and xyz2 (B, m, 3) are the forward inputs, idx1 (B, n) and idx2 (B, m) the
+// int32 neighbour indices returned by the forward pass, and grad_dist1 (B, n) / grad_dist2
+// (B, m) the incoming gradients. All tensors must live on the same CUDA device; floating-point
+// tensors must be float32. Non-contiguous tensors are copied into contiguous storage first
+// because the kernel indexes raw flat buffers.
+//
+// Returns {grad_xyz1, grad_xyz2}, shaped like xyz1 and xyz2. Throws c10::Error on invalid
+// input or when a kernel launch fails.
+std::vector<torch::Tensor> chamfer_dist_grad_kernel_wrapper(torch::Tensor xyz1, torch::Tensor xyz2,
+                                                            torch::Tensor idx1, torch::Tensor idx2,
+                                                            torch::Tensor grad_dist1,
+                                                            torch::Tensor grad_dist2)
+{
+    TORCH_CHECK(xyz1.is_cuda() && xyz2.is_cuda() && idx1.is_cuda() && idx2.is_cuda() &&
+                    grad_dist1.is_cuda() && grad_dist2.is_cuda(),
+                "chamfer_dist_grad: all inputs must be CUDA tensors");
+    TORCH_CHECK(xyz2.device() == xyz1.device() && idx1.device() == xyz1.device() &&
+                    idx2.device() == xyz1.device() && grad_dist1.device() == xyz1.device() &&
+                    grad_dist2.device() == xyz1.device(),
+                "chamfer_dist_grad: all inputs must be on the same device");
+    TORCH_CHECK(xyz1.scalar_type() == torch::kFloat && xyz2.scalar_type() == torch::kFloat &&
+                    grad_dist1.scalar_type() == torch::kFloat &&
+                    grad_dist2.scalar_type() == torch::kFloat,
+                "chamfer_dist_grad: xyz1, xyz2, grad_dist1 and grad_dist2 must be float32");
+    TORCH_CHECK(idx1.scalar_type() == torch::kInt && idx2.scalar_type() == torch::kInt,
+                "chamfer_dist_grad: idx1 and idx2 must be int32 tensors");
+    TORCH_CHECK(xyz1.dim() == 3 && xyz1.size(2) == 3 && xyz2.dim() == 3 && xyz2.size(2) == 3,
+                "chamfer_dist_grad: xyz1 and xyz2 must have shape (B, num_points, 3)");
+    TORCH_CHECK(xyz1.size(0) == xyz2.size(0),
+                "chamfer_dist_grad: xyz1 and xyz2 must have the same batch size");
+    // The kernel computes flat offsets with int arithmetic.
+    TORCH_CHECK(xyz1.numel() <= std::numeric_limits<int>::max() &&
+                    xyz2.numel() <= std::numeric_limits<int>::max(),
+                "chamfer_dist_grad: inputs are too large for 32-bit indexing");
+
+    const int batch_size = static_cast<int>(xyz1.size(0));
+    const int n = static_cast<int>(xyz1.size(1)); // num_points point cloud A
+    const int m = static_cast<int>(xyz2.size(1)); // num_points point cloud B
+
+    const int64_t count1 = static_cast<int64_t>(batch_size) * n;
+    const int64_t count2 = static_cast<int64_t>(batch_size) * m;
+    TORCH_CHECK(idx1.numel() == count1 && grad_dist1.numel() == count1,
+                "chamfer_dist_grad: idx1 and grad_dist1 must have shape (B, n)");
+    TORCH_CHECK(idx2.numel() == count2 && grad_dist2.numel() == count2,
+                "chamfer_dist_grad: idx2 and grad_dist2 must have shape (B, m)");
+
+    xyz1 = xyz1.contiguous();
+    xyz2 = xyz2.contiguous();
+    idx1 = idx1.contiguous();
+    idx2 = idx2.contiguous();
+    // Autograd may hand over expanded (stride-0) gradients.
+    grad_dist1 = grad_dist1.contiguous();
+    grad_dist2 = grad_dist2.contiguous();
+
+    // Allocate and launch on the inputs' device and on PyTorch's current stream.
+    const c10::cuda::CUDAGuard device_guard(xyz1.device());
+    cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+    // Explicit shapes guarantee the dense layout the kernel writes into.
+    torch::Tensor grad_xyz1 = torch::zeros({batch_size, n, 3}, xyz1.options());
+    torch::Tensor grad_xyz2 = torch::zeros({batch_size, m, 3}, xyz2.options());
+
+    const dim3 grid(1, 16, 1);
+    const int threads = 256;
+
+    // Contribution of the xyz1 -> xyz2 matches.
+    chamfer_dist_grad_kernel<<<grid, threads, 0, stream>>>(
+        batch_size, n, xyz1.data_ptr<float>(), m, xyz2.data_ptr<float>(),
+        grad_dist1.data_ptr<float>(), idx1.data_ptr<int>(), grad_xyz1.data_ptr<float>(),
+        grad_xyz2.data_ptr<float>());
+    cudaError_t err = cudaGetLastError();
+    TORCH_CHECK(err == cudaSuccess,
+                "Error in chamfer_dist_grad_kernel_wrapper: ", cudaGetErrorString(err));
+
+    // Contribution of the xyz2 -> xyz1 matches, accumulated into the same buffers.
+    chamfer_dist_grad_kernel<<<grid, threads, 0, stream>>>(
+        batch_size, m, xyz2.data_ptr<float>(), n, xyz1.data_ptr<float>(),
+        grad_dist2.data_ptr<float>(), idx2.data_ptr<int>(), grad_xyz2.data_ptr<float>(),
+        grad_xyz1.data_ptr<float>());
+    err = cudaGetLastError();
+    TORCH_CHECK(err == cudaSuccess,
+                "Error in chamfer_dist_grad_kernel_wrapper: ", cudaGetErrorString(err));
+
+    return {grad_xyz1, grad_xyz2};
+}
diff --git a/cuda/src/chamfer_dist_gpu.cpp b/cuda/src/chamfer_dist_gpu.cpp
@@ -0,0 +1,13 @@
+#include "chamfer_dist.h"
+
+// Forward entry point: hands both point clouds to the CUDA launcher declared in
+// chamfer_dist.h and returns its result unchanged.
+std::vector<torch::Tensor> chamfer_dist(torch::Tensor xyz1, torch::Tensor xyz2)
+{
+    std::vector<torch::Tensor> outputs = chamfer_dist_kernel_wrapper(xyz1, xyz2);
+    return outputs;
+}
+
+// Backward entry point: forwards the saved inputs, the neighbour indices and the incoming
+// gradients to the CUDA launcher declared in chamfer_dist.h and returns its result unchanged.
+std::vector<torch::Tensor> chamfer_dist_grad(torch::Tensor xyz1, torch::Tensor xyz2,
+                                             torch::Tensor idx1, torch::Tensor idx2,
+                                             torch::Tensor grad_dist1, torch::Tensor grad_dist2)
+{
+    std::vector<torch::Tensor> gradients =
+        chamfer_dist_grad_kernel_wrapper(xyz1, xyz2, idx1, idx2, grad_dist1, grad_dist2);
+    return gradients;
+}
diff --git a/test/test_chamfer.py b/test/test_chamfer.py
@@ -0,0 +1,24 @@
+import os
+import sys
+import torch
+import unittest
+
+from torch.autograd import gradcheck
+
+ROOT = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..")
+sys.path.insert(0, ROOT)
+
+from torch_points_kernels import ChamferFunction
+
+
+class TestChamferDistance(unittest.TestCase):
+    def test_chamfer_dist(self):
+        x = torch.rand(4, 64, 3).double()
+        y = torch.rand(4, 128, 3).double()
+        x.requires_grad = True
+        y.requires_grad = True
+        test = gradcheck(ChamferFunction.apply, [x.cuda(), y.cuda()])
+
+
+if __name__ == '__main__':  # run the test suite when this file is executed as a script
+    unittest.main()
diff --git a/torch_points_kernels/__init__.py b/torch_points_kernels/__init__.py
@@ -12,4 +12,5 @@
     "knn",
     "region_grow",
     "instance_iou",
+    "chamfer_dist"
 ]
diff --git a/torch_points_kernels/torchpoints.py b/torch_points_kernels/torchpoints.py
@@ -235,3 +235,31 @@ def ball_query(
         return ball_query_dense(radius, nsample, x, y, sort=sort)
     else:
         raise Exception("unrecognized mode {}".format(mode))
+
+
+class ChamferFunction(Function):
+    @staticmethod
+    def forward(ctx, xyz1, xyz2):
+        dist1, dist2, idx1, idx2 = tpcuda.chamfer_dist(xyz1, xyz2)
+        print(dir(tpcuda))
+        ctx.save_for_backward(xyz1, xyz2, idx1, idx2)
+
+        return dist1, dist2
+
+    @staticmethod
+    def backward(ctx, grad_dist1, grad_dist2):
+        xyz1, xyz2, idx1, idx2 = ctx.saved_tensors
+        grad_xyz1, grad_xyz2 = tpcuda.chamfer_dist_grad(xyz1, xyz2, idx1, idx2, grad_dist1, grad_dist2)
+        return grad_xyz1, grad_xyz2
+
+
+def chamfer_dist(self, xyz1, xyz2, ignore_zeros=False):
+    batch_size = xyz1.size(0)
+    if batch_size == 1 and ignore_zeros:
+        non_zeros1 = torch.sum(xyz1, dim=2).ne(0)
+        non_zeros2 = torch.sum(xyz2, dim=2).ne(0)
+        xyz1 = xyz1[non_zeros1].unsqueeze(dim=0)
+        xyz2 = xyz2[non_zeros2].unsqueeze(dim=0)
+
+    dist1, dist2 = ChamferFunction.apply(xyz1, xyz2)
+    return torch.mean(dist1) + torch.mean(dist2)

Original file line number	Diff line number	Diff line change
`@@ -12,4 +12,5 @@`
`12`	`12`	`"knn",`
`13`	`13`	`"region_grow",`
`14`	`14`	`"instance_iou",`
	`15`	`+ "chamfer_dist"`
`15`	`16`	`]`