Merge pull request #7 from nicolas-chaulet/ball_query_2

nicolas-chaulet · web-flow · commit 01407c786ef5 · 2020-01-09T14:35:39.000Z
Ball query 2
diff --git a/README.md b/README.md
@@ -17,6 +17,12 @@ import torch
 import torch_points.points_cuda
 ```
 
+## Build and test
+```
+python setup.py build_ext --inplace
+python -m unittest
+```
+
 ## Projects using those kernels.
 
 [```Pytorch Point Cloud Benchmark```](https://github.com/nicolas-chaulet/deeppointcloud-benchmarks) by 
diff --git a/cuda/include/ball_query.h b/cuda/include/ball_query.h
@@ -1,5 +1,14 @@
 #pragma once
 #include <torch/extension.h>
 
-at::Tensor ball_query(at::Tensor new_xyz, at::Tensor xyz, const float radius,
-                      const int nsample);
+// Dense ball query. Per the kernel's header comment in ball_query_gpu.cu,
+// new_xyz is (b, m, 3), xyz is (b, n, 3) and the returned int32 index tensor
+// is (b, m, nsample). Only CUDA tensors are supported.
+at::Tensor ball_query_dense(at::Tensor new_xyz, at::Tensor xyz, const float radius,
+			    const int nsample);
+
+// Ball query on "partial dense" data: points of all batches are stacked in
+// x / y (3 floats per point) and batch_x / batch_y hold one batch id per point.
+// NOTE(review): the implementation derives the batch count from the last entry
+// of batch_x, so batch ids are presumably sorted ascending -- confirm with callers.
+// Returns (idx, dist), both of shape (x.size(0), nsample):
+//   idx  - int64 indices into y, unused slots hold y.size(0)
+//   dist - float squared distances, unused slots hold -1
+std::pair<at::Tensor, at::Tensor> ball_query_partial_dense(at::Tensor x,
+							   at::Tensor y,
+							   at::Tensor batch_x,
+							   at::Tensor batch_y,
+							   const float radius,
+							   const int nsample);
+
+// Returns a tensor of length num_nodes (same options as row) whose i-th entry
+// is the number of times the value i occurs in row.
+at::Tensor degree(at::Tensor row, int64_t num_nodes);
diff --git a/cuda/include/cuda_utils.h b/cuda/include/cuda_utils.h
@@ -10,7 +10,7 @@
 
 #include <vector>
 
-#define TOTAL_THREADS 512
+#define TOTAL_THREADS 1024
 
 inline int opt_n_threads(int work_size) {
   const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0);
diff --git a/cuda/src/ball_query.cpp b/cuda/src/ball_query.cpp
@@ -1,12 +1,24 @@
 #include "ball_query.h"
 #include "utils.h"
 
-void query_ball_point_kernel_wrapper(int b, int n, int m, float radius,
-                                     int nsample, const float *new_xyz,
-                                     const float *xyz, int *idx);
+// Host-side launcher of the dense ball-query CUDA kernel; defined in
+// ball_query_gpu.cu. b is the batch size, n the number of points per batch in
+// xyz, m the number of query points per batch in new_xyz; idx receives
+// (b, m, nsample) indices.
+void query_ball_point_kernel_dense_wrapper(int b, int n, int m, float radius,
+					   int nsample, const float *new_xyz,
+					   const float *xyz, int *idx);
 
-at::Tensor ball_query(at::Tensor new_xyz, at::Tensor xyz, const float radius,
-                      const int nsample) {
+// Host-side launcher of the partial-dense ball-query CUDA kernel; defined in
+// ball_query_gpu.cu. The parameter types must match that definition exactly,
+// in particular idx_out is int64_t* there (it receives int64 indices).
+//   batch_size        - number of batches (one CUDA block is launched per batch)
+//   size_x, size_y    - number of points in x and y
+//   x, y              - point coordinates, 3 floats per point
+//   batch_x, batch_y  - per-batch start offsets into x / y (batch_size + 1 entries)
+//   idx_out, dist_out - outputs with nsample entries per point of x
+void query_ball_point_kernel_partial_wrapper(long batch_size,
+					     int size_x,
+					     int size_y,
+					     float radius,
+					     int nsample,
+					     const float *x,
+					     const float *y,
+					     const long *batch_x,
+					     const long *batch_y,
+					     int64_t *idx_out,
+					     float *dist_out);
+
+at::Tensor ball_query_dense(at::Tensor new_xyz, at::Tensor xyz, const float radius,
+			    const int nsample) {
   CHECK_CONTIGUOUS(new_xyz);
   CHECK_CONTIGUOUS(xyz);
   CHECK_IS_FLOAT(new_xyz);
@@ -21,12 +33,71 @@ at::Tensor ball_query(at::Tensor new_xyz, at::Tensor xyz, const float radius,
                    at::device(new_xyz.device()).dtype(at::ScalarType::Int));
 
   if (new_xyz.type().is_cuda()) {
-    query_ball_point_kernel_wrapper(xyz.size(0), xyz.size(1), new_xyz.size(1),
-                                    radius, nsample, new_xyz.data<float>(),
-                                    xyz.data<float>(), idx.data<int>());
+    query_ball_point_kernel_dense_wrapper(xyz.size(0), xyz.size(1), new_xyz.size(1),
+					  radius, nsample, new_xyz.data<float>(),
+					  xyz.data<float>(), idx.data<int>());
   } else {
     AT_CHECK(false, "CPU not supported");
   }
 
   return idx;
 }
+
+// Histogram of ids: entry i of the result is the number of times the value i
+// appears in `row`. The result has `num_nodes` entries and the same
+// dtype/device options as `row`.
+at::Tensor degree(at::Tensor row, int64_t num_nodes) {
+	const auto options = row.options();
+	at::Tensor counts = at::zeros(num_nodes, options);
+	const at::Tensor ones = at::ones(row.size(0), options);
+	// Add 1 into counts[row[k]] for every element k of row.
+	counts.scatter_add_(0, row, ones);
+	return counts;
+}
+
+// Ball query on "partial dense" data (CUDA only).
+//
+// x and y stack the points of every batch (3 floats per point); batch_x and
+// batch_y hold one batch id per point. The batch count is taken from the last
+// entry of batch_x, so batch ids are expected to be sorted in ascending order.
+//
+// Returns (idx, dist), both of shape (x.size(0), nsample):
+//   idx  - int64 indices into y of the points found within `radius`;
+//          unused slots keep the fill value y.size(0)
+//   dist - float squared distances for those points; unused slots keep -1
+//
+// Raises "CPU not supported" when x is not a CUDA tensor.
+std::pair<at::Tensor, at::Tensor> ball_query_partial_dense(at::Tensor x,
+							   at::Tensor y,
+							   at::Tensor batch_x,
+							   at::Tensor batch_y,
+							   const float radius,
+							   const int nsample) {
+	CHECK_CONTIGUOUS(x);
+	CHECK_CONTIGUOUS(y);
+	CHECK_IS_FLOAT(x);
+	CHECK_IS_FLOAT(y);
+
+	// Reject CPU input before touching the CUDA runtime: there is no CPU
+	// implementation and the code below assumes device memory.
+	AT_CHECK(x.type().is_cuda(), "CPU not supported");
+	CHECK_CUDA(y);
+	CHECK_CUDA(batch_x);
+	CHECK_CUDA(batch_y);
+
+	at::Tensor idx = torch::full({x.size(0), nsample}, y.size(0),
+				      at::device(x.device()).dtype(at::ScalarType::Long));
+
+	at::Tensor dist = torch::full({x.size(0), nsample}, -1,
+				      at::device(x.device()).dtype(at::ScalarType::Float));
+
+	cudaSetDevice(x.get_device());
+
+	// The last batch id gives the number of batches. item() performs the
+	// device-to-host copy and reports CUDA errors, so no manually managed
+	// host buffer is needed.
+	const int64_t batch_size = batch_x[-1].item<int64_t>() + 1;
+
+	// Turn the per-point batch ids into per-batch start offsets:
+	// [0, n_0, n_0 + n_1, ...] with batch_size + 1 entries.
+	batch_x = degree(batch_x, batch_size);
+	batch_x = at::cat({at::zeros(1, batch_x.options()), batch_x.cumsum(0)}, 0);
+	batch_y = degree(batch_y, batch_size);
+	batch_y = at::cat({at::zeros(1, batch_y.options()), batch_y.cumsum(0)}, 0);
+
+	query_ball_point_kernel_partial_wrapper(batch_size,
+						x.size(0),
+						y.size(0),
+						radius, nsample,
+						x.data<float>(),
+						y.data<float>(),
+						batch_x.data<long>(),
+						batch_y.data<long>(),
+						idx.data<long>(),
+						dist.data<float>());
+
+	return std::make_pair(idx, dist);
+}
diff --git a/cuda/src/ball_query_gpu.cu b/cuda/src/ball_query_gpu.cu
@@ -6,15 +6,16 @@
 
 // input: new_xyz(b, m, 3) xyz(b, n, 3)
 // output: idx(b, m, nsample)
-__global__ void query_ball_point_kernel(int b, int n, int m, float radius,
-                                        int nsample,
-                                        const float *__restrict__ new_xyz,
-                                        const float *__restrict__ xyz,
-                                        int *__restrict__ idx) {
+__global__ void query_ball_point_kernel_dense(int b, int n, int m, float radius,
+					      int nsample,
+					      const float *__restrict__ new_xyz,
+					      const float *__restrict__ xyz,
+					      int *__restrict__ idx_out) {
+
   int batch_index = blockIdx.x;
   xyz += batch_index * n * 3;
   new_xyz += batch_index * m * 3;
-  idx += m * nsample * batch_index;
+  idx_out += m * nsample * batch_index;
 
   int index = threadIdx.x;
   int stride = blockDim.x;
@@ -33,22 +34,83 @@ __global__ void query_ball_point_kernel(int b, int n, int m, float radius,
       if (d2 < radius2) {
         if (cnt == 0) {
           for (int l = 0; l < nsample; ++l) {
-            idx[j * nsample + l] = k;
+            idx_out[j * nsample + l] = k;
           }
         }
-        idx[j * nsample + cnt] = k;
+        idx_out[j * nsample + cnt] = k;
         ++cnt;
       }
     }
   }
 }
 
-void query_ball_point_kernel_wrapper(int b, int n, int m, float radius,
-                                     int nsample, const float *new_xyz,
-                                     const float *xyz, int *idx) {
+// input: x(size_x, 3) y(size_y, 3)
+//        batch_x / batch_y: start offsets of every batch in x / y
+//        (entry i is the first point of batch i, entry i + 1 is one past its last)
+// output: idx_out(size_x, nsample) dist_out(size_x, nsample)
+// Launch with one block per batch. Slots that are not filled are left untouched.
+__global__ void query_ball_point_kernel_partial_dense(int size_x,
+						      int size_y,
+						      float radius,
+						      int nsample,
+						      const float *__restrict__ x,
+						      const float *__restrict__ y,
+						      const long *__restrict__ batch_x,
+						      const long *__restrict__ batch_y,
+						      int64_t *__restrict__ idx_out,
+						      float *__restrict__ dist_out) {
+
+	// taken from https://github.com/rusty1s/pytorch_cluster/blob/master/cuda/radius_kernel.cu
+	const ptrdiff_t batch_idx = blockIdx.x;
+
+	const ptrdiff_t start_idx_x = batch_x[batch_idx];
+	const ptrdiff_t end_idx_x = batch_x[batch_idx + 1];
+
+	const ptrdiff_t start_idx_y = batch_y[batch_idx];
+	const ptrdiff_t end_idx_y = batch_y[batch_idx + 1];
+	const float radius2 = radius * radius;
+
+	// The threads of the block stride over the x points of this batch. Using
+	// blockDim.x keeps the stride correct for any launch configuration.
+	for (ptrdiff_t n_x = start_idx_x + threadIdx.x; n_x < end_idx_x; n_x += blockDim.x) {
+		int64_t count = 0;
+		// Test `count < nsample` before writing so that nsample <= 0 never
+		// produces an out-of-bounds store, and stop once the row is full.
+		for (ptrdiff_t n_y = start_idx_y; n_y < end_idx_y && count < nsample; n_y++) {
+			float dist = 0;
+			for (ptrdiff_t d = 0; d < 3; d++) {
+				const float diff = x[n_x * 3 + d] - y[n_y * 3 + d];
+				dist += diff * diff;
+			}
+			if (dist <= radius2) {
+				idx_out[n_x * nsample + count] = n_y;
+				dist_out[n_x * nsample + count] = dist;
+				count++;
+			}
+		}
+	}
+}
+
+void query_ball_point_kernel_dense_wrapper(int b, int n, int m, float radius,
+					   int nsample, const float *new_xyz,
+					   const float *xyz, int *idx) {
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  query_ball_point_kernel<<<b, opt_n_threads(m), 0, stream>>>(
+  query_ball_point_kernel_dense<<<b, opt_n_threads(m), 0, stream>>>(
       b, n, m, radius, nsample, new_xyz, xyz, idx);
 
   CUDA_CHECK_ERRORS();
 }
+
+// Host-side launcher of query_ball_point_kernel_partial_dense.
+//   batch_size        - number of batches; one block of TOTAL_THREADS threads
+//                       is launched per batch
+//   size_x, size_y    - number of points in x and y
+//   x, y              - device pointers to the coordinates, 3 floats per point
+//   batch_x, batch_y  - device pointers to the per-batch start offsets
+//                       (batch_size + 1 entries each)
+//   idx_out, dist_out - device output buffers with nsample entries per x point
+void query_ball_point_kernel_partial_wrapper(long batch_size,
+						int size_x,
+						int size_y,
+						float radius,
+						int nsample,
+						const float *x,
+						const float *y,
+						const long *batch_x,
+						const long *batch_y,
+						int64_t *idx_out,
+						float *dist_out) {
+
+	// Launch on PyTorch's current stream, as the dense wrapper does, so the
+	// kernel is ordered after the tensor ops that produced its inputs.
+	cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+	query_ball_point_kernel_partial_dense<<<batch_size, TOTAL_THREADS, 0, stream>>>(
+		size_x, size_y, radius, nsample, x, y,
+		batch_x, batch_y, idx_out, dist_out);
+
+	CUDA_CHECK_ERRORS();
+}
diff --git a/cuda/src/bindings.cpp b/cuda/src/bindings.cpp
@@ -12,7 +12,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   m.def("three_interpolate", &three_interpolate);
   m.def("three_interpolate_grad", &three_interpolate_grad);
 
-  m.def("ball_query", &ball_query);
+  m.def("ball_query_dense", &ball_query_dense);
+  m.def("ball_query_partial_dense", &ball_query_partial_dense);
 
   m.def("group_points", &group_points);
   m.def("group_points_grad", &group_points_grad);
diff --git a/setup.py b/setup.py
@@ -35,7 +35,7 @@
 
 setup(
     name="torch_points",
-    version="0.1.2",
+    version="0.1.3",
     author="Nicolas Chaulet",
     packages=find_packages(),
     install_requires=[],
diff --git a/test/test_ballquerry.py b/test/test_ballquerry.py
@@ -1,6 +1,6 @@
 import unittest
 import torch
-from torch_points import ball_query
+from torch_points import ball_query_dense
 import numpy.testing as npt
 import numpy as np
 
@@ -10,19 +10,17 @@ def test_simple_gpu(self):
         a = torch.tensor([[[0, 0, 0], [1, 0, 0], [2, 0, 0]]]).to(torch.float).cuda()
         b = torch.tensor([[[0, 0, 0]]]).to(torch.float).cuda()
 
-        npt.assert_array_equal(ball_query(1, 2, a, b).detach().cpu().numpy(), np.array([[[0, 0]]]))
+        npt.assert_array_equal(ball_query_dense(1, 2, a, b).detach().cpu().numpy(), np.array([[[0, 0]]]))
 
     def test_simple_cpu(self):
         a = torch.tensor([[[0, 0, 0], [1, 0, 0], [2, 0, 0]]]).to(torch.float)
         b = torch.tensor([[[0, 0, 0]]]).to(torch.float)
-        npt.assert_array_equal(ball_query(1, 2, a, b).detach().numpy(), np.array([[[0, 0]]]))
+        npt.assert_array_equal(ball_query_dense(1, 2, a, b).detach().numpy(), np.array([[[0, 0]]]))
 
     def test_cpu_gpu_equality(self):
         a = torch.randn(5, 1000, 3)
-        npt.assert_array_equal(ball_query(0.1, 17, a, a).detach().numpy(),
-                               ball_query(0.1, 17, a.cuda(), a.cuda()).detach().numpy())
-
-
+        npt.assert_array_equal(ball_query_dense(0.1, 17, a, a).detach().numpy(),
+                               ball_query_dense(0.1, 17, a.cuda(), a.cuda()).cpu().detach().numpy())
 
 
 if __name__ == "__main__":
diff --git a/test/test_ballquerry_partial.py b/test/test_ballquerry_partial.py
@@ -0,0 +1,30 @@
+import unittest
+import torch
+from torch_points import ball_query
+from torch_cluster import radius_cuda
+import numpy.testing as npt
+import numpy as np
+
+class TestBallPartial(unittest.TestCase):
+    def test_simple_gpu(self):
+        x = torch.tensor([[10, 0, 0], [0.1, 0, 0], [10, 0, 0], [0.1, 0, 0]]).to(torch.float).cuda()
+        y = torch.tensor([[0, 0, 0]]).to(torch.float).cuda()
+        batch_x = torch.from_numpy(np.asarray([0, 0, 1, 1])).long().cuda()
+        batch_y = torch.from_numpy(np.asarray([0])).long().cuda()
+        
+        batch_x = torch.from_numpy(np.asarray([0, 0, 1, 1])).long().cuda()
+        batch_y = torch.from_numpy(np.asarray([0])).long().cuda()
+
+        idx, dist2 = ball_query(1., 2, x, y, batch_x, batch_y, mode="PARTIAL_DENSE")
+
+        idx = idx.detach().cpu().numpy()
+        dist2 = dist2.detach().cpu().numpy()
+
+        idx_answer = np.asarray([[1, 1], [0, 1], [1, 1], [1, 1]])
+        dist2_answer = np.asarray([[-1, -1], [0.01, -1], [-1, -1], [-1, -1]]).astype(np.float32)
+
+        npt.assert_array_almost_equal(idx, idx_answer)
+        npt.assert_array_almost_equal(dist2, dist2_answer)
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/test_grouping.py b/test/test_grouping.py
diff --git a/torch_points/torchpoints.py b/torch_points/torchpoints.py