torch-points3d
diff --git a/‎.pre-commit-config.yaml‎
Lines changed: 1 addition & 1 deletion b/‎.pre-commit-config.yaml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎CHANGELOG.md‎
Lines changed: 17 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 17 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 4 additions & 0 deletions b/‎README.md‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎cuda/include/chamfer_dist.h‎
Lines changed: 15 additions & 0 deletions b/‎cuda/include/chamfer_dist.h‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎cuda/src/ball_query.cpp‎
Lines changed: 9 additions & 8 deletions b/‎cuda/src/ball_query.cpp‎
Lines changed: 9 additions & 8 deletions
diff --git a/‎cuda/src/ball_query_gpu.cu‎
Lines changed: 18 additions & 14 deletions b/‎cuda/src/ball_query_gpu.cu‎
Lines changed: 18 additions & 14 deletions
diff --git a/‎cuda/src/bindings.cpp‎
Lines changed: 4 additions & 0 deletions b/‎cuda/src/bindings.cpp‎
Lines changed: 4 additions & 0 deletions
@@ -12,7 +12,7 @@ repos:
     rev: stable
     hooks:
       - id: black
-        language_version: python3.6
+        language_version: python3.7
         args: ["--config", ".black.toml"]
   - repo: local
     hooks:
 
@@ -1,3 +1,20 @@
+# 0.6.7
+## Additions
+- Chamfer distance introduced in https://arxiv.org/pdf/1612.00603 for dense batches
+
+# 0.6.6
+## Additions
+- Windows support
+
+
+## Change
+- Develop with Python 3.7
+
+## Bug fix
+- Fixed bug in region growing related to batching
+- Fixed ball query for partial dense data on GPU, which was returning only the first point
+
+
 # 0.6.5
 
 ## Additions
 
@@ -26,6 +26,8 @@ python -m unittest
 ```
 
 ## Troubleshooting
+
+### Compilation issues
 Ensure that at least PyTorch 1.4.0 is installed and verify that `cuda/bin` and `cuda/include` are in your `$PATH` and `$CPATH` respectively, e.g.:
 ```
 $ python -c "import torch; print(torch.__version__)"
@@ -78,3 +80,5 @@ See [this useful chart](http://arnon.dk/matching-sm-architectures-arch-and-genco
 * [```Pointnet2_Tensorflow```](https://github.com/charlesq34/pointnet2) by [Charles R. Qi](https://github.com/charlesq34)
 
 * [```Pointnet2_PyTorch```](https://github.com/erikwijmans/Pointnet2_PyTorch) by [Erik Wijmans](https://github.com/erikwijmans)
+
+* [```GRNet```](https://github.com/hzxie/GRNet) by [Haozhe Xie](https://github.com/hzxie)
@@ -0,0 +1,15 @@
+#include <torch/extension.h>
+#include <vector>
+// Registered with pybind11 under the name "chamfer_dist" in bindings.cpp.
+std::vector<torch::Tensor> chamfer_dist(torch::Tensor xyz1, torch::Tensor xyz2);
+// Registered with pybind11 as "chamfer_dist_grad" in bindings.cpp. NOTE(review): presumably the backward pass of chamfer_dist - confirm.
+std::vector<torch::Tensor> chamfer_dist_grad(torch::Tensor xyz1, torch::Tensor xyz2,
+                                             torch::Tensor idx1, torch::Tensor idx2,
+                                             torch::Tensor grad_dist1, torch::Tensor grad_dist2);
+// NOTE(review): presumably the CUDA-side launcher behind chamfer_dist (same parameters) - confirm against the .cu implementation.
+std::vector<torch::Tensor> chamfer_dist_kernel_wrapper(torch::Tensor xyz1, torch::Tensor xyz2);
+// NOTE(review): presumably the CUDA-side launcher behind chamfer_dist_grad (same parameters) - confirm against the .cu implementation.
+std::vector<torch::Tensor> chamfer_dist_grad_kernel_wrapper(torch::Tensor xyz1, torch::Tensor xyz2,
+                                                            torch::Tensor idx1, torch::Tensor idx2,
+                                                            torch::Tensor grad_dist1,
+                                                            torch::Tensor grad_dist2);
@@ -6,10 +6,11 @@ void query_ball_point_kernel_dense_wrapper(int b, int n, int m, float radius, in
                                            const float* new_xyz, const float* xyz, int64_t* idx,
                                            float* dist_out);
 
-void query_ball_point_kernel_partial_wrapper(int64_t batch_size, int size_x, int size_y, float radius,
-                                             int nsample, const float* x, const float* y,
-                                             const int64_t* batch_x, const int64_t* batch_y,
-                                             int64_t* idx_out, float* dist_out);
+void query_ball_point_kernel_partial_wrapper(int64_t batch_size, int size_x, int size_y,
+                                             float radius, int nsample, const float* x,
+                                             const float* y, const int64_t* batch_x,
+                                             const int64_t* batch_y, int64_t* idx_out,
+                                             float* dist_out);
 
 std::pair<at::Tensor, at::Tensor> ball_query_dense(at::Tensor new_xyz, at::Tensor xyz,
                                                    const float radius, const int nsample)
@@ -71,10 +72,10 @@ std::pair<at::Tensor, at::Tensor> ball_query_partial_dense(at::Tensor x, at::Ten
     batch_y = degree(batch_y, batch_size);
     batch_y = at::cat({at::zeros(1, batch_y.options()), batch_y.cumsum(0)}, 0);
 
-    query_ball_point_kernel_partial_wrapper(batch_size, x.size(0), y.size(0), radius, nsample,
-                                            x.DATA_PTR<float>(), y.DATA_PTR<float>(),
-                                            batch_x.DATA_PTR<int64_t>(), batch_y.DATA_PTR<int64_t>(),
-                                            idx.DATA_PTR<int64_t>(), dist.DATA_PTR<float>());
+    query_ball_point_kernel_partial_wrapper(
+        batch_size, x.size(0), y.size(0), radius, nsample, x.DATA_PTR<float>(), y.DATA_PTR<float>(),
+        batch_x.DATA_PTR<int64_t>(), batch_y.DATA_PTR<int64_t>(), idx.DATA_PTR<int64_t>(),
+        dist.DATA_PTR<float>());
 
     return std::make_pair(idx, dist);
 }
@@ -9,7 +9,7 @@
 __global__ void query_ball_point_kernel_dense(int b, int n, int m, float radius, int nsample,
                                               const float* __restrict__ new_xyz,
                                               const float* __restrict__ xyz,
-                                              int64_t* __restrict__ idx_out, 
+                                              int64_t* __restrict__ idx_out,
                                               float* __restrict__ dist_out)
 {
     int batch_index = blockIdx.x;
@@ -51,15 +51,17 @@ __global__ void query_ball_point_kernel_dense(int b, int n, int m, float radius,
     }
 }
 
-__global__ void query_ball_point_kernel_partial_dense(
-    int size_x, int size_y, float radius, int nsample, const float* __restrict__ x,
-    const float* __restrict__ y, const int64_t* __restrict__ batch_x, const int64_t* __restrict__ batch_y,
-    int64_t* __restrict__ idx_out, float* __restrict__ dist_out)
+__global__ void query_ball_point_kernel_partial_dense(int size_x, int size_y, float radius,
+                                                      int nsample, const float* __restrict__ x,
+                                                      const float* __restrict__ y,
+                                                      const int64_t* __restrict__ batch_x,
+                                                      const int64_t* __restrict__ batch_y,
+                                                      int64_t* __restrict__ idx_out,
+                                                      float* __restrict__ dist_out)
 {
     // taken from
     // https://github.com/rusty1s/pytorch_cluster/blob/master/cuda/radius_kernel.cu
     const ptrdiff_t batch_idx = blockIdx.x;
-    const ptrdiff_t idx = threadIdx.x;
 
     const ptrdiff_t start_idx_x = batch_x[batch_idx];
     const ptrdiff_t end_idx_x = batch_x[batch_idx + 1];
@@ -68,10 +70,10 @@ __global__ void query_ball_point_kernel_partial_dense(
     const ptrdiff_t end_idx_y = batch_y[batch_idx + 1];
     float radius2 = radius * radius;
 
-    for (ptrdiff_t n_x = start_idx_x + idx; n_x < end_idx_x; n_x += TOTAL_THREADS_SPARSE)
+    for (ptrdiff_t n_y = start_idx_y + threadIdx.x; n_y < end_idx_y; n_y += blockDim.x)
     {
         int64_t count = 0;
-        for (ptrdiff_t n_y = start_idx_y; n_y < end_idx_y; n_y++)
+        for (ptrdiff_t n_x = start_idx_x; n_x < end_idx_x; n_x++)
         {
             float dist = 0;
             for (ptrdiff_t d = 0; d < 3; d++)
@@ -93,19 +95,21 @@ __global__ void query_ball_point_kernel_partial_dense(
 }
 
 void query_ball_point_kernel_dense_wrapper(int b, int n, int m, float radius, int nsample,
-                                           const float* new_xyz, const float* xyz, int64_t* idx,float* dist_out)
+                                           const float* new_xyz, const float* xyz, int64_t* idx,
+                                           float* dist_out)
 {
     cudaStream_t stream = at::cuda::getCurrentCUDAStream();
     query_ball_point_kernel_dense<<<b, opt_n_threads(m), 0, stream>>>(b, n, m, radius, nsample,
-                                                                      new_xyz, xyz, idx,dist_out);
+                                                                      new_xyz, xyz, idx, dist_out);
 
     CUDA_CHECK_ERRORS();
 }
 
-void query_ball_point_kernel_partial_wrapper(int64_t batch_size, int size_x, int size_y, float radius,
-                                             int nsample, const float* x, const float* y,
-                                             const int64_t* batch_x, const int64_t* batch_y,
-                                             int64_t* idx_out, float* dist_out)
+void query_ball_point_kernel_partial_wrapper(int64_t batch_size, int size_x, int size_y,
+                                             float radius, int nsample, const float* x,
+                                             const float* y, const int64_t* batch_x,
+                                             const int64_t* batch_y, int64_t* idx_out,
+                                             float* dist_out)
 {
     query_ball_point_kernel_partial_dense<<<batch_size, TOTAL_THREADS_SPARSE>>>(
         size_x, size_y, radius, nsample, x, y, batch_x, batch_y, idx_out, dist_out);
 
@@ -1,4 +1,5 @@
 #include "ball_query.h"
+#include "chamfer_dist.h"
 #include "interpolate.h"
 #include "metrics.h"
 #include "sampling.h"
@@ -15,4 +16,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
     m.def("ball_query_partial_dense", &ball_query_partial_dense);
 
     m.def("instance_iou_cuda", &instance_iou_cuda);
+
+    m.def("chamfer_dist", &chamfer_dist);
+    m.def("chamfer_dist_grad", &chamfer_dist_grad);
 }