Knn interpolate on CPU

nicolas-chaulet · nicolas-chaulet · commit 340677a91986 · 2020-02-27T16:11:33.000Z
diff --git a/cpu/include/interpolate.h b/cpu/include/interpolate.h
@@ -0,0 +1,7 @@
+#pragma once
+#include <torch/extension.h>
+
+at::Tensor knn_interpolate(at::Tensor features, at::Tensor idx, at::Tensor weight);
+
+at::Tensor knn_interpolate_grad(at::Tensor grad_out, at::Tensor idx, at::Tensor weight,
+                                const int m);
diff --git a/cpu/include/knn.h b/cpu/include/knn.h
@@ -1,3 +1,3 @@
 #pragma once
 #include <torch/extension.h>
-std::pair<at::Tensor, at::Tensor> dense_knn(at::Tensor query, at::Tensor support, int k);
+std::pair<at::Tensor, at::Tensor> dense_knn(at::Tensor support, at::Tensor query, int k);
diff --git a/cpu/include/utils.h b/cpu/include/utils.h
@@ -3,4 +3,4 @@
 
 #define CHECK_CPU(x) AT_ASSERTM(!x.type().is_cuda(), #x " must be a CPU tensor")
 
-#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be a contiguous tensor")
+#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be a contiguous tensor")
diff --git a/cpu/src/bindings.cpp b/cpu/src/bindings.cpp
@@ -1,11 +1,16 @@
 #include "ball_query.h"
+// #include "fps.h"
+#include "interpolate.h"
 #include "knn.h"
 
 using namespace pybind11::literals;
 
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
 {
-    m.def("dense_knn", &dense_knn,"", "support"_a, "querry"_a, "k"_a);
+    m.def("dense_knn", &dense_knn, "", "support"_a, "querry"_a, "k"_a);
+    m.def("knn_interpolate", &knn_interpolate, "", "features"_a, "idx"_a, "weights"_a);
+    m.def("knn_interpolate_grad", &knn_interpolate_grad, "", "grad_out"_a, "idx"_a, "weights"_a,
+          "m"_a);
 
     m.def("ball_query", &ball_query,
           "compute the radius search of a point cloud using nanoflann"
diff --git a/cpu/src/interpolate.cpp b/cpu/src/interpolate.cpp
@@ -0,0 +1,70 @@
+#include "compat.h"
+#include "utils.h"
+#include <iostream>
+#include <torch/extension.h>
+
+/**
+ * Weighted neighbour interpolation of features on CPU.
+ *
+ * For every batch b, channel c and target point p this computes
+ *   output[b][c][p] = sum_i features[b][c][idx[b][p][i]] * weight[b][p][i]
+ *
+ * @param features contiguous 3-D CPU tensor indexed [b][c][source point].
+ * @param idx      contiguous 3-D CPU int64 tensor indexed [b][p][i]; each value selects a
+ *                 position along the last dimension of `features`.
+ * @param weight   contiguous 3-D CPU tensor with the shape of `idx` and the dtype of `features`.
+ * @return tensor of shape {features.size(0), features.size(1), idx.size(1)} with the dtype and
+ *         device of `features`.
+ * @throws c10::Error if a precondition above is violated or an index is out of range.
+ */
+at::Tensor knn_interpolate(at::Tensor features, at::Tensor idx, at::Tensor weight)
+{
+    CHECK_CONTIGUOUS(features);
+    CHECK_CONTIGUOUS(idx);
+    CHECK_CONTIGUOUS(weight);
+    CHECK_CPU(idx);
+    CHECK_CPU(features);
+    CHECK_CPU(weight);
+    TORCH_CHECK(features.dim() == 3 && idx.dim() == 3 && weight.dim() == 3,
+                "features, idx and weight must all be 3-dimensional");
+    TORCH_CHECK(idx.scalar_type() == at::kLong, "idx must be an int64 tensor");
+    TORCH_CHECK(weight.scalar_type() == features.scalar_type(),
+                "weight must have the same dtype as features");
+    TORCH_CHECK(idx.size(0) == features.size(0),
+                "idx and features must have the same batch size");
+    TORCH_CHECK(weight.sizes().equals(idx.sizes()), "weight must have the same shape as idx");
+
+    const int64_t batch_size = features.size(0);
+    const int64_t num_channels = features.size(1);
+    const int64_t num_sources = features.size(2);
+    const int64_t num_targets = idx.size(1);
+    const int64_t num_neighbors = idx.size(2);
+
+    // torch::zeros already zero-initialises the accumulator, no per-element reset is needed.
+    at::Tensor output = torch::zeros({batch_size, num_channels, num_targets},
+                                     at::device(features.device()).dtype(features.scalar_type()));
+
+    AT_DISPATCH_ALL_TYPES(features.scalar_type(), "knn_interpolate", [&] {
+        auto output_a = output.accessor<scalar_t, 3>();
+        auto features_a = features.accessor<scalar_t, 3>();
+        auto weight_a = weight.accessor<scalar_t, 3>();
+        // int64_t rather than long: long is only 32 bits wide on LLP64 platforms.
+        auto idx_a = idx.accessor<int64_t, 3>();
+
+        for (int64_t b = 0; b < batch_size; b++)
+        {
+            for (int64_t p = 0; p < num_targets; p++)
+            {
+                // Neighbour loop sits outside the channel loop so each index is validated and
+                // each weight is loaded once; every output element still accumulates its
+                // neighbours in ascending i order, so results are unchanged.
+                for (int64_t i = 0; i < num_neighbors; i++)
+                {
+                    const int64_t src = idx_a[b][p][i];
+                    TORCH_CHECK(src >= 0 && src < num_sources, "idx value ", src,
+                                " is out of range [0, ", num_sources, ")");
+                    const scalar_t w = weight_a[b][p][i];
+                    for (int64_t c = 0; c < num_channels; c++)
+                        output_a[b][c][p] += features_a[b][c][src] * w;
+                }
+            }
+        }
+    });
+    return output;
+}
+
+/**
+ * Backward pass of knn_interpolate on CPU: scatters the output gradient back onto the
+ * source points.
+ *
+ * For every batch b, channel c, target point p and neighbour i this accumulates
+ *   output[b][c][idx[b][p][i]] += grad_out[b][c][p] * weight[b][p][i]
+ *
+ * @param grad_out 3-D CPU tensor indexed [b][c][p].
+ * @param idx      3-D CPU int64 tensor indexed [b][p][i]; every value must lie in [0, m).
+ * @param weight   3-D CPU tensor with the shape of `idx` and the dtype of `grad_out`.
+ * @param m        size of the last dimension of the returned tensor.
+ * @return tensor of shape {grad_out.size(0), grad_out.size(1), m} with the dtype and device of
+ *         `grad_out`.
+ * @throws c10::Error if a precondition above is violated or an index is out of range.
+ */
+at::Tensor knn_interpolate_grad(at::Tensor grad_out, at::Tensor idx, at::Tensor weight, const int m)
+{
+    // Accessors honour strides, so contiguity is not required here; host memory is.
+    CHECK_CPU(grad_out);
+    CHECK_CPU(idx);
+    CHECK_CPU(weight);
+    TORCH_CHECK(grad_out.dim() == 3 && idx.dim() == 3 && weight.dim() == 3,
+                "grad_out, idx and weight must all be 3-dimensional");
+    TORCH_CHECK(idx.scalar_type() == at::kLong, "idx must be an int64 tensor");
+    TORCH_CHECK(weight.scalar_type() == grad_out.scalar_type(),
+                "weight must have the same dtype as grad_out");
+    TORCH_CHECK(idx.size(0) == grad_out.size(0),
+                "idx and grad_out must have the same batch size");
+    TORCH_CHECK(idx.size(1) == grad_out.size(2),
+                "idx.size(1) must match the last dimension of grad_out");
+    TORCH_CHECK(weight.sizes().equals(idx.sizes()), "weight must have the same shape as idx");
+
+    const int64_t batch_size = grad_out.size(0);
+    const int64_t num_channels = grad_out.size(1);
+    const int64_t num_targets = idx.size(1);
+    const int64_t num_neighbors = idx.size(2);
+
+    at::Tensor output = torch::zeros({batch_size, num_channels, m},
+                                     at::device(grad_out.device()).dtype(grad_out.scalar_type()));
+
+    AT_DISPATCH_ALL_TYPES(grad_out.scalar_type(), "knn_interpolate_grad", [&] {
+        auto output_a = output.accessor<scalar_t, 3>();
+        auto grad_out_a = grad_out.accessor<scalar_t, 3>();
+        auto weight_a = weight.accessor<scalar_t, 3>();
+        // int64_t rather than long: long is only 32 bits wide on LLP64 platforms.
+        auto idx_a = idx.accessor<int64_t, 3>();
+
+        for (int64_t b = 0; b < batch_size; b++)
+        {
+            for (int64_t p = 0; p < num_targets; p++)
+            {
+                // Validate each scatter target once, before it is used as a write index.
+                // Per output element the contributions are still added in (p, i) order.
+                for (int64_t i = 0; i < num_neighbors; i++)
+                {
+                    const int64_t dst = idx_a[b][p][i];
+                    TORCH_CHECK(dst >= 0 && dst < m, "idx value ", dst, " is out of range [0, ", m,
+                                ")");
+                    const scalar_t w = weight_a[b][p][i];
+                    for (int64_t c = 0; c < num_channels; c++)
+                        output_a[b][c][dst] += grad_out_a[b][c][p] * w;
+                }
+            }
+        }
+    });
+    return output;
+}
diff --git a/cpu/src/knn.cpp b/cpu/src/knn.cpp
@@ -1,27 +1,22 @@
-#include "ball_query.h"
 #include "compat.h"
 #include "neighbors.cpp"
 #include "neighbors.h"
 #include "utils.h"
 #include <iostream>
 #include <torch/extension.h>
 
-
 std::pair<at::Tensor, at::Tensor> _single_batch_knn(at::Tensor support, at::Tensor query, int k)
 {
     CHECK_CONTIGUOUS(support);
     CHECK_CONTIGUOUS(query);
     if (support.size(0) < k)
-        TORCH_CHECK(false, "Not enough points in support to find "+ std::to_string(k) + " neighboors")
-
-    at::Tensor out;
-    at::Tensor out_dists;
-    std::vector<long> neighbors_indices(query.size(0), -1);
-    std::vector<float> neighbors_dists(query.size(0), -1);
+        TORCH_CHECK(false,
+                    "Not enough points in support to find " + std::to_string(k) + " neighboors")
+    std::vector<long> neighbors_indices(query.size(0) * k, -1);
+    std::vector<float> neighbors_dists(query.size(0) * k, -1);
 
     auto options = torch::TensorOptions().dtype(torch::kLong).device(torch::kCPU);
-    auto options_dist = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCPU);
-
+    auto options_dist = torch::TensorOptions().dtype(query.scalar_type()).device(torch::kCPU);
     AT_DISPATCH_ALL_TYPES(query.scalar_type(), "knn", [&] {
         auto data_q = query.DATA_PTR<scalar_t>();
         auto data_s = support.DATA_PTR<scalar_t>();
@@ -31,12 +26,13 @@ std::pair<at::Tensor, at::Tensor> _single_batch_knn(at::Tensor support, at::Tens
             std::vector<scalar_t>(data_s, data_s + support.size(0) * support.size(1));
 
         nanoflann_knn_neighbors<scalar_t>(queries_stl, supports_stl, neighbors_indices,
-                                                  neighbors_dists, k);
+                                          neighbors_dists, k);
     });
     auto neighbors_dists_ptr = neighbors_dists.data();
     long* neighbors_indices_ptr = neighbors_indices.data();
-    out = torch::from_blob(neighbors_indices_ptr, {query.size(0), k}, options = options);
-    out_dists = torch::from_blob(neighbors_dists_ptr, {query.size(0), k}, options = options_dist);
+    auto out = torch::from_blob(neighbors_indices_ptr, {query.size(0), k}, options = options);
+    auto out_dists =
+        torch::from_blob(neighbors_dists_ptr, {query.size(0), k}, options = options_dist);
 
     return std::make_pair(out.clone(), out_dists.clone());
 }
@@ -45,6 +41,8 @@ std::pair<at::Tensor, at::Tensor> dense_knn(at::Tensor support, at::Tensor query
 {
     CHECK_CONTIGUOUS(support);
     CHECK_CONTIGUOUS(query);
+    CHECK_CPU(query);
+    CHECK_CPU(support);
 
     int b = query.size(0);
     vector<at::Tensor> batch_idx;
diff --git a/cpu/src/neighbors.cpp b/cpu/src/neighbors.cpp
@@ -128,7 +128,6 @@ int nanoflann_neighbors(vector<scalar_t>& queries, vector<scalar_t>& supports,
     return max_count;
 }
 
-
 template <typename scalar_t>
 int batch_nanoflann_neighbors(vector<scalar_t>& queries, vector<scalar_t>& supports,
                               vector<long>& q_batches, vector<long>& s_batches,
@@ -283,11 +282,10 @@ int batch_nanoflann_neighbors(vector<scalar_t>& queries, vector<scalar_t>& suppo
 
 template <typename scalar_t>
 void nanoflann_knn_neighbors(vector<scalar_t>& queries, vector<scalar_t>& supports,
-                        vector<long>& neighbors_indices, vector<float>& dists, int k)
+                             vector<long>& neighbors_indices, vector<float>& dists, int k)
 {
     // Nanoflann related variables
     // ***************************
-
     // CLoud variable
     PointCloud<scalar_t> pcd;
     pcd.set(supports);
@@ -315,12 +313,10 @@ void nanoflann_knn_neighbors(vector<scalar_t>& queries, vector<scalar_t>& suppor
         // Find neighbors
         scalar_t query_pt[3] = {p0.x, p0.y, p0.z};
         std::vector<size_t> ret_index(k);
-		std::vector<scalar_t> out_dist_sqr(k);
+        std::vector<scalar_t> out_dist_sqr(k);
 
-        const size_t nMatches =
-            index->knnSearch(&query_pt[0], k, &ret_index[0], &out_dist_sqr[0]);
-        
-        for (size_t i=0; i < nMatches; i++)
+        const size_t nMatches = index->knnSearch(&query_pt[0], k, &ret_index[0], &out_dist_sqr[0]);
+        for (size_t i = 0; i < nMatches; i++)
         {
             neighbors_indices[i + current_pos] = ret_index[i];
             dists[i + current_pos] = out_dist_sqr[i];
diff --git a/cuda/include/ball_query.h b/cuda/include/ball_query.h
@@ -1,8 +1,8 @@
 #pragma once
 #include <torch/extension.h>
 
-at::Tensor ball_query_dense(at::Tensor new_xyz, at::Tensor xyz, const float radius,
-                            const int nsample);
+std::pair<at::Tensor, at::Tensor> ball_query_dense(at::Tensor new_xyz, at::Tensor xyz,
+                                                   const float radius, const int nsample);
 
 std::pair<at::Tensor, at::Tensor> ball_query_partial_dense(at::Tensor x, at::Tensor y,
                                                            at::Tensor batch_x, at::Tensor batch_y,
diff --git a/cuda/src/ball_query.cpp b/cuda/src/ball_query.cpp
@@ -3,15 +3,16 @@
 #include "utils.h"
 
 void query_ball_point_kernel_dense_wrapper(int b, int n, int m, float radius, int nsample,
-                                           const float* new_xyz, const float* xyz, int* idx, float* dist_out);
+                                           const float* new_xyz, const float* xyz, int* idx,
+                                           float* dist_out);
 
 void query_ball_point_kernel_partial_wrapper(long batch_size, int size_x, int size_y, float radius,
                                              int nsample, const float* x, const float* y,
                                              const long* batch_x, const long* batch_y,
                                              long* idx_out, float* dist_out);
 
-at::Tensor ball_query_dense(at::Tensor new_xyz, at::Tensor xyz, const float radius,
-                            const int nsample)
+std::pair<at::Tensor, at::Tensor> ball_query_dense(at::Tensor new_xyz, at::Tensor xyz,
+                                                   const float radius, const int nsample)
 {
     CHECK_CONTIGUOUS(new_xyz);
     CHECK_CONTIGUOUS(xyz);
@@ -25,20 +26,19 @@ at::Tensor ball_query_dense(at::Tensor new_xyz, at::Tensor xyz, const float radi
 
     at::Tensor idx = torch::zeros({new_xyz.size(0), new_xyz.size(1), nsample},
                                   at::device(new_xyz.device()).dtype(at::ScalarType::Int));
-    at::Tensor dist =
-        torch::full({new_xyz.size(0), new_xyz.size(1), nsample}, -1, at::device(new_xyz.device()).dtype(at::ScalarType::Float));
+    at::Tensor dist = torch::full({new_xyz.size(0), new_xyz.size(1), nsample}, -1,
+                                  at::device(new_xyz.device()).dtype(at::ScalarType::Float));
 
     if (new_xyz.type().is_cuda())
     {
-        query_ball_point_kernel_dense_wrapper(xyz.size(0), xyz.size(1), new_xyz.size(1), radius,
-                                              nsample, new_xyz.DATA_PTR<float>(),
-                                              xyz.DATA_PTR<float>(), idx.DATA_PTR<int>(), dist.DATA_PTR<int>());
+        query_ball_point_kernel_dense_wrapper(
+            xyz.size(0), xyz.size(1), new_xyz.size(1), radius, nsample, new_xyz.DATA_PTR<float>(),
+            xyz.DATA_PTR<float>(), idx.DATA_PTR<int>(), dist.DATA_PTR<float>());
     }
     else
     {
         TORCH_CHECK(false, "CPU not supported");
     }
-
     return std::make_pair(idx, dist);
 }
 
diff --git a/cuda/src/ball_query_gpu.cu b/cuda/src/ball_query_gpu.cu
@@ -16,6 +16,7 @@ __global__ void query_ball_point_kernel_dense(int b, int n, int m, float radius,
     xyz += batch_index * n * 3;
     new_xyz += batch_index * m * 3;
     idx_out += m * nsample * batch_index;
+    dist_out += m * nsample * batch_index;
 
     int index = threadIdx.x;
     int stride = blockDim.x;
@@ -43,7 +44,7 @@ __global__ void query_ball_point_kernel_dense(int b, int n, int m, float radius,
                     }
                 }
                 idx_out[j * nsample + cnt] = k;
-                dist_out[j * nsample + cnt] = d2
+                dist_out[j * nsample + cnt] = d2;
                 ++cnt;
             }
         }
diff --git a/test/test_ballquerry.py b/test/test_ballquerry.py
@@ -14,25 +14,25 @@ def test_simple_gpu(self):
         a = torch.tensor([[[0, 0, 0], [1, 0, 0], [2, 0, 0]], [[0, 0, 0], [1, 0, 0], [2, 0, 0]]]).to(torch.float).cuda()
         b = torch.tensor([[[0, 0, 0]], [[3, 0, 0]]]).to(torch.float).cuda()
         idx, dist = ball_query(1.01, 2, a, b)
-        torch.testing.assert_allclose(idx.cpu(), torch.tensor([[[0, 1]], [[2, 2]]]))
+        torch.testing.assert_allclose(idx.long().cpu(), torch.tensor([[[0, 1]], [[2, 2]]]))
         torch.testing.assert_allclose(dist.cpu(), torch.tensor([[[0, 1]], [[1, -1]]]).float())
 
     def test_simple_cpu(self):
         a = torch.tensor([[[0, 0, 0], [1, 0, 0], [2, 0, 0]], [[0, 0, 0], [1, 0, 0], [2, 0, 0]]]).to(torch.float)
         b = torch.tensor([[[0, 0, 0]], [[3, 0, 0]]]).to(torch.float)
         idx, dist = ball_query(1.01, 2, a, b)
-        torch.testing.assert_allclose(idx, torch.tensor([[[0, 1]], [[2, 2]]]))
+        torch.testing.assert_allclose(idx.long(), torch.tensor([[[0, 1]], [[2, 2]]]))
         torch.testing.assert_allclose(dist, torch.tensor([[[0, 1]], [[1, -1]]]).float())
 
         a = torch.tensor([[[0, 0, 0], [1, 0, 0], [1, 1, 0]]]).to(torch.float)
         idx, dist = ball_query(1.01, 3, a, a)
-        torch.testing.assert_allclose(idx,torch.tensor([[[0, 1, 0],[1,0,2],[2,1,2]]]))
+        torch.testing.assert_allclose(idx.long(),torch.tensor([[[0, 1, 0],[1,0,2],[2,1,2]]]))
 
     @run_if_cuda
     def test_larger_gpu(self):
         a = torch.randn(32, 4096, 3).to(torch.float).cuda()
         idx,dist = ball_query(1, 64, a, a)
-        self.assertGreaterEqual(idx.min()[0], 0)
+        self.assertGreaterEqual(idx.min(), 0)
 
     @run_if_cuda
     def test_cpu_gpu_equality(self):
diff --git a/test/test_interpolate.py b/test/test_interpolate.py
@@ -1,24 +1,44 @@
 import unittest
 import torch
-from torch_points import three_interpolate_tg, three_interpolate, three_nn
+from torch.autograd import gradcheck
+from torch_points import three_interpolate, three_nn
+
+from . import run_if_cuda
+
 
 class TestInterpolate(unittest.TestCase):
-    def test_cpu(self):
-        pos = torch.randn([16, 100, 3])
-        pos_skip = torch.randn([16, 500, 3])
-        x = torch.randn([16, 30, 100])
+    @run_if_cuda
+    def test_gpu(self):
+        pos = torch.randn([16, 100, 3]).cuda()
+        pos_skip = torch.randn([16, 500, 3]).cuda()
+        x = torch.randn([16, 30, 100], requires_grad=True).cuda()
+
+        dist, idx = three_nn(pos_skip, pos)
+        dist_recip = 1.0 / (dist + 1e-8)
+        norm = torch.sum(dist_recip, dim=2, keepdim=True)
+        weight = dist_recip / norm
+        interpolated_feats = three_interpolate(x, idx, weight)
+
+        dist, idx = three_nn(pos_skip.cpu(), pos.cpu())
+        dist_recip = 1.0 / (dist + 1e-8)
+        norm = torch.sum(dist_recip, dim=2, keepdim=True)
+        weight = dist_recip / norm
+        interpolated_feats_cpu = three_interpolate(x.cpu(), idx, weight)
 
-        # # dense
-        # dist, idx = three_nn(pos_skip, pos)
-        # dist_recip = 1.0 / (dist + 1e-8)
-        # norm = torch.sum(dist_recip, dim=2, keepdim=True)
-        # weight = dist_recip / norm
-        # interpolated_feats = three_interpolate(x, idx, weight)
+        torch.testing.assert_allclose(interpolated_feats_cpu, interpolated_feats.cpu())
 
-        # sparse
-        sp_interpolated = three_interpolate_tg(x,pos,pos_skip)
+    def test_grad(self):
+        b, n, k = (2, 10, 3)
+        pos = torch.randn([b, n, k]).double()
+        pos_skip = torch.randn([b, 2 * n, k]).double()
+        x = torch.randn([b, 30, n], requires_grad=True).double()
+        dist, idx = three_nn(pos_skip, pos)
+        dist_recip = 1.0 / (dist + 1e-8)
+        norm = torch.sum(dist_recip, dim=2, keepdim=True)
+        weight = dist_recip / norm
+        input = (x, idx, weight)
+        test = gradcheck(three_interpolate, input, eps=1e-6, atol=1e-4)
 
-        # torch.testing.assert_allclose(sp_interpolated, interpolated_feats)
 
 if __name__ == "__main__":
-    unittest.main()
+    unittest.main()
diff --git a/test/test_knn.py b/test/test_knn.py
diff --git a/torch_points/__init__.py b/torch_points/__init__.py
diff --git a/torch_points/knn_interpolate.py b/torch_points/knn_interpolate.py
diff --git a/torch_points/torchpoints.py b/torch_points/torchpoints.py

Original file line number	Diff line number	Diff line change
`@@ -3,4 +3,4 @@`
`3`	`3`
`4`	`4`	`#define CHECK_CPU(x) AT_ASSERTM(!x.type().is_cuda(), #x " must be a CPU tensor")`
`5`	`5`
`6`		`-#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be a contiguous tensor")`
	`6`	`+#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be a contiguous tensor")`
Original file line number	Diff line number	Diff line change
`@@ -16,6 +16,7 @@ __global__ void query_ball_point_kernel_dense(int b, int n, int m, float radius,`
`16`	`16`	`xyz += batch_index * n * 3;`
`17`	`17`	`new_xyz += batch_index * m * 3;`
`18`	`18`	`idx_out += m * nsample * batch_index;`
	`19`	`+ dist_out += m * nsample * batch_index;`
`19`	`20`
`20`	`21`	`int index = threadIdx.x;`
`21`	`22`	`int stride = blockDim.x;`
`@@ -43,7 +44,7 @@ __global__ void query_ball_point_kernel_dense(int b, int n, int m, float radius,`
`43`	`44`	`}`
`44`	`45`	`}`
`45`	`46`	`idx_out[j * nsample + cnt] = k;`
`46`		`- dist_out[j * nsample + cnt] = d2`
	`47`	`+ dist_out[j * nsample + cnt] = d2;`
`47`	`48`	`++cnt;`
`48`	`49`	`}`
`49`	`50`	`}`