kdtree_cuda

koide3 · koide3 · commit e2826fcc5407 · 2025-12-07T08:29:11.000+09:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -236,6 +236,9 @@ if(BUILD_WITH_CUDA)
     src/gtsam_points/cuda/nonlinear_factor_set_gpu_create.cpp
     src/gtsam_points/cuda/stream_roundrobin.cu
     src/gtsam_points/cuda/stream_temp_buffer_roundrobin.cu
+    # ann
+    src/gtsam_points/ann/kdtree_cuda.cpp
+    src/gtsam_points/ann/kdtree_cuda.cu
     # types
     src/gtsam_points/types/point_cloud.cu
     src/gtsam_points/types/point_cloud_gpu.cu
diff --git a/include/gtsam_points/ann/kdtree_cuda.hpp b/include/gtsam_points/ann/kdtree_cuda.hpp
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025  Kenji Koide (k.koide@aist.go.jp)
+#pragma once
+
+#include <limits>
+#include <cstdint>
+#include <gtsam_points/types/point_cloud.hpp>
+#include <gtsam_points/ann/small_kdtree.hpp>
+
+struct CUstream_st;
+
+namespace gtsam_points {
+
+/// @brief Flattened KdTree node in the layout that is copied to the GPU.
+/// @note  A node is treated as a leaf when `left == INVALID_NODE`; only then is `node_type.lr` filled in.
+///        For every other node `node_type.sub` holds the splitting plane.
+struct KdTreeNodeGPU {
+  NodeIndexType left = INVALID_NODE;   ///< Left child node index (INVALID_NODE marks a leaf).
+  NodeIndexType right = INVALID_NODE;  ///< Right child node index.
+
+  union {
+    struct Leaf {
+      NodeIndexType first;  ///< First position in the tree's index array covered by this leaf.
+      NodeIndexType last;   ///< End position in the index array; the GPU search iterates `first <= i < last`.
+    } lr;                   ///< Leaf node.
+    struct NonLeaf {
+      NodeIndexType axis;  ///< Projection axis; used to pick the component of the query vector.
+      float thresh;        ///< Threshold value; queries with `query[axis] < thresh` descend into `left` first.
+    } sub;                 ///< Non-leaf node.
+  } node_type;
+};
+
+/// @brief KdTree whose nodes and index array live in GPU memory and which answers
+///        nearest neighbor queries with one GPU thread per query.
+class KdTreeGPU {
+public:
+  /// @brief Build the tree on the host and upload it to the GPU.
+  /// @param points  Point cloud to index. It must provide both CPU points (used to build the tree)
+  ///                and GPU points (read during the search). On failure an error is printed and
+  ///                the tree is left empty.
+  /// @param stream  CUDA stream on which the device buffers are allocated and filled.
+  KdTreeGPU(const PointCloud::ConstPtr& points, CUstream_st* stream = nullptr);
+  ~KdTreeGPU();
+
+  // The object owns raw device buffers that are released in the destructor.
+  // An implicit copy would make two objects free the same buffers, so copying is disabled.
+  KdTreeGPU(const KdTreeGPU&) = delete;
+  KdTreeGPU& operator=(const KdTreeGPU&) = delete;
+
+  /// @brief Nearest neighbor search for queries that already reside in GPU memory.
+  /// @param queries      Device pointer to `num_queries` query points.
+  /// @param num_queries  Number of queries.
+  /// @param nn_indices   Device pointer receiving, per query, the index of the nearest point
+  ///                     (INVALID_NODE when no point was found).
+  /// @param nn_sq_dists  Device pointer receiving, per query, the squared distance to that point.
+  /// @param stream       CUDA stream on which the search is executed.
+  void nearest_neighbor_search(
+    const Eigen::Vector3f* queries,
+    size_t num_queries,
+    std::uint32_t* nn_indices,
+    float* nn_sq_dists,
+    CUstream_st* stream = nullptr);
+
+  /// @brief Nearest neighbor search for queries stored in host memory.
+  ///        Queries are copied to the GPU, searched there, and the results are copied back.
+  /// @param h_queries      Host pointer to `num_queries` query points.
+  /// @param num_queries    Number of queries.
+  /// @param h_nn_indices   Host pointer receiving the nearest point index of each query.
+  /// @param h_nn_sq_dists  Host pointer receiving the squared distance of each query.
+  /// @param stream         CUDA stream used for the temporary buffers, the copies and the search.
+  void nearest_neighbor_search_cpu(
+    const Eigen::Vector3f* h_queries,
+    size_t num_queries,
+    std::uint32_t* h_nn_indices,
+    float* h_nn_sq_dists,
+    CUstream_st* stream = nullptr);
+
+private:
+  PointCloud::ConstPtr points;  ///< Indexed point cloud; its GPU points are read by the search.
+  size_t num_indices;           ///< Number of elements in `indices`.
+  size_t num_nodes;             ///< Number of elements in `nodes`.
+  std::uint32_t* indices;       ///< Device buffer of point indices referenced by the leaf nodes.
+  KdTreeNodeGPU* nodes;         ///< Device buffer of tree nodes; node 0 is where the search starts.
+};
+
+}  // namespace gtsam_points
diff --git a/src/gtsam_points/ann/kdtree_cuda.cpp b/src/gtsam_points/ann/kdtree_cuda.cpp
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025  Kenji Koide (k.koide@aist.go.jp)
+#include <gtsam_points/ann/kdtree_cuda.hpp>
+
+#include <gtsam_points/cuda/check_error.cuh>
+#include <gtsam_points/ann/small_kdtree.hpp>
+
+namespace gtsam_points {
+
+/// Builds a KdTree over `points` on the host and uploads its index array and nodes to the GPU.
+/// When the input is unusable an error is printed and the object is left empty
+/// (null device buffers, zero counts).
+KdTreeGPU::KdTreeGPU(const PointCloud::ConstPtr& points, CUstream_st* stream)
+: points(points),
+  num_indices(0),
+  num_nodes(0),
+  indices(nullptr),
+  nodes(nullptr) {
+  // A null pointer is rejected together with an empty cloud; dereferencing it below would crash.
+  if (!points || !points->has_points()) {
+    std::cerr << "error: empty point cloud is given for KdTreeGPU" << std::endl;
+    return;
+  }
+  // The search kernel reads the points from GPU memory, so they must already be there.
+  if (!points->has_points_gpu()) {
+    std::cerr << "error: point cloud does not have GPU points for KdTreeGPU" << std::endl;
+    return;
+  }
+
+  // Build the tree on the CPU.
+  KdTreeBuilder builder;
+  UnsafeKdTree<PointCloud> kdtree(*points, builder);
+
+  // Convert the tree into the flat layout that is copied to the GPU.
+  std::vector<std::uint32_t> h_indices(kdtree.indices.begin(), kdtree.indices.end());
+  std::vector<KdTreeNodeGPU> h_nodes(kdtree.nodes.size());
+
+  for (size_t i = 0; i < kdtree.nodes.size(); i++) {
+    const auto& in = kdtree.nodes[i];
+    auto& out = h_nodes[i];
+
+    out.left = in.left;
+    out.right = in.right;
+
+    // A node without a left child is a leaf; only the matching union member is filled in.
+    if (in.left == INVALID_NODE) {
+      out.node_type.lr.first = in.node_type.lr.first;
+      out.node_type.lr.last = in.node_type.lr.last;
+    } else {
+      out.node_type.sub.axis = in.node_type.sub.proj.axis;
+      out.node_type.sub.thresh = in.node_type.sub.thresh;
+    }
+  }
+
+  // Allocate the device buffers and upload the converted tree on the given stream.
+  num_indices = kdtree.indices.size();
+  num_nodes = kdtree.nodes.size();
+  check_error << cudaMallocAsync(&indices, sizeof(std::uint32_t) * num_indices, stream);
+  check_error << cudaMallocAsync(&nodes, sizeof(KdTreeNodeGPU) * num_nodes, stream);
+  check_error << cudaMemcpyAsync(indices, h_indices.data(), sizeof(std::uint32_t) * num_indices, cudaMemcpyHostToDevice, stream);
+  check_error << cudaMemcpyAsync(nodes, h_nodes.data(), sizeof(KdTreeNodeGPU) * num_nodes, cudaMemcpyHostToDevice, stream);
+}
+
+/// Releases the device buffers that hold the index array and the tree nodes.
+KdTreeGPU::~KdTreeGPU() {
+  // Both pointers remain null when the constructor returned early on invalid input.
+  // cudaFreeAsync is not documented to accept a null pointer, so such buffers are skipped.
+  if (indices != nullptr) {
+    check_error << cudaFreeAsync(indices, nullptr);
+  }
+  if (nodes != nullptr) {
+    check_error << cudaFreeAsync(nodes, nullptr);
+  }
+}
+
+/// Nearest neighbor search for queries stored in host memory.
+/// The queries are staged in temporary device buffers, searched on the GPU, and the results
+/// are copied back. The stream is synchronized before returning, so the host output buffers
+/// are valid as soon as this function returns.
+/// @param h_queries      Host pointer to `num_queries` query points.
+/// @param num_queries    Number of queries; nothing is done when it is zero.
+/// @param h_nn_indices   Host pointer receiving the nearest point index of each query.
+/// @param h_nn_sq_dists  Host pointer receiving the squared distance of each query.
+/// @param stream         CUDA stream used for allocations, copies and the search.
+void KdTreeGPU::nearest_neighbor_search_cpu(
+  const Eigen::Vector3f* h_queries,
+  size_t num_queries,
+  std::uint32_t* h_nn_indices,
+  float* h_nn_sq_dists,
+  CUstream_st* stream) {
+  // Nothing to search; this also avoids zero-byte allocations and the frees that would follow.
+  if (num_queries == 0) {
+    return;
+  }
+
+  Eigen::Vector3f* d_queries = nullptr;
+  std::uint32_t* d_nn_indices = nullptr;
+  float* d_nn_sq_dists = nullptr;
+
+  // Stage the queries on the device.
+  check_error << cudaMallocAsync(&d_queries, sizeof(Eigen::Vector3f) * num_queries, stream);
+  check_error << cudaMallocAsync(&d_nn_indices, sizeof(std::uint32_t) * num_queries, stream);
+  check_error << cudaMallocAsync(&d_nn_sq_dists, sizeof(float) * num_queries, stream);
+  check_error << cudaMemcpyAsync(d_queries, h_queries, sizeof(Eigen::Vector3f) * num_queries, cudaMemcpyHostToDevice, stream);
+
+  nearest_neighbor_search(d_queries, num_queries, d_nn_indices, d_nn_sq_dists, stream);
+
+  // Fetch the results and release the temporary buffers in stream order.
+  check_error << cudaMemcpyAsync(h_nn_indices, d_nn_indices, sizeof(std::uint32_t) * num_queries, cudaMemcpyDeviceToHost, stream);
+  check_error << cudaMemcpyAsync(h_nn_sq_dists, d_nn_sq_dists, sizeof(float) * num_queries, cudaMemcpyDeviceToHost, stream);
+
+  check_error << cudaFreeAsync(d_queries, stream);
+  check_error << cudaFreeAsync(d_nn_indices, stream);
+  check_error << cudaFreeAsync(d_nn_sq_dists, stream);
+
+  // Device-to-host copies into pinned host memory return before the data has arrived,
+  // so wait for the stream before handing the output buffers back to the caller.
+  check_error << cudaStreamSynchronize(stream);
+}
+
+}  // namespace gtsam_points
diff --git a/src/gtsam_points/ann/kdtree_cuda.cu b/src/gtsam_points/ann/kdtree_cuda.cu
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025  Kenji Koide (k.koide@aist.go.jp)
+#include <gtsam_points/ann/kdtree_cuda.hpp>
+
+#include <algorithm>
+#include <thrust/copy.h>
+
+#include <thrust/pair.h>
+#include <thrust/for_each.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <gtsam_points/cuda/check_error.cuh>
+
+namespace gtsam_points {
+
+namespace {
+
+/// Functor that finds the nearest point for a single query.
+/// It is invoked once per query index and walks the tree iteratively with a fixed-size
+/// stack of (node index, squared-distance lower bound) entries, starting from node 0.
+struct nearest_neighbor_search_kernel {
+  /// Capacity of the per-thread traversal stack.
+  static constexpr int MAX_STACK_SIZE = 20;
+
+  /// @param query_id  Index of the query to process; also the slot written in the output arrays.
+  __device__ void operator()(std::uint32_t query_id) const {
+    const Eigen::Vector3f query = queries[query_id];
+
+    // Best candidate found so far.
+    NodeIndexType best_index = INVALID_NODE;
+    float best_sq_dist = std::numeric_limits<float>::max();
+
+    // The traversal starts with the root node (index 0) and a zero lower bound.
+    thrust::pair<int, float> search_stack[MAX_STACK_SIZE] = {{0, 0.0f}};
+    int stack_size = 1;
+
+    while (stack_size > 0) {
+      stack_size--;
+      const int node_index = search_stack[stack_size].first;
+      const float lower_bound = search_stack[stack_size].second;
+
+      // The subtree cannot contain anything closer than the current best.
+      if (lower_bound > best_sq_dist) {
+        continue;
+      }
+
+      const KdTreeNodeGPU node = nodes[node_index];
+
+      // Leaf node: test every point referenced by the leaf.
+      if (node.left == INVALID_NODE) {
+        for (NodeIndexType slot = node.node_type.lr.first; slot < node.node_type.lr.last; slot++) {
+          const NodeIndexType pt_index = indices[slot];
+          const float pt_sq_dist = (points[pt_index] - query).squaredNorm();
+          if (pt_sq_dist < best_sq_dist) {
+            best_index = pt_index;
+            best_sq_dist = pt_sq_dist;
+          }
+        }
+        continue;
+      }
+
+      // Non-leaf node: signed offset of the query from the splitting plane.
+      const float diff = query[node.node_type.sub.axis] - node.node_type.sub.thresh;
+      const float cut_sq_dist = diff * diff;
+
+      // The child on the query's side of the plane is visited first.
+      const bool query_on_left = diff < 0.0f;
+      const int best_child = query_on_left ? node.left : node.right;
+      const int other_child = query_on_left ? node.right : node.left;
+
+      if (stack_size > MAX_STACK_SIZE - 2) {
+        // No room for two more entries: the far child is dropped.
+        printf("kdtree stack overflow!!");
+      } else if (cut_sq_dist < best_sq_dist) {
+        // The far child is only worth visiting if the plane is closer than the current best.
+        search_stack[stack_size].first = other_child;
+        search_stack[stack_size].second = cut_sq_dist;
+        stack_size++;
+      }
+
+      // Pushed last so that it is popped next.
+      search_stack[stack_size].first = best_child;
+      search_stack[stack_size].second = 0.0f;
+      stack_size++;
+    }
+
+    nn_indices[query_id] = best_index;
+    nn_sq_dists[query_id] = best_sq_dist;
+  }
+
+  const Eigen::Vector3f* __restrict__ points;  ///< Points of the indexed cloud.
+  const std::uint32_t* __restrict__ indices;   ///< Point indices referenced by the leaf nodes.
+  const KdTreeNodeGPU* __restrict__ nodes;     ///< Tree nodes.
+
+  const Eigen::Vector3f* __restrict__ queries;  ///< Query points.
+
+  std::uint32_t* nn_indices;  ///< Output: nearest point index per query.
+  float* nn_sq_dists;         ///< Output: squared distance to the nearest point per query.
+};
+
+}  // namespace
+
+/// Runs the nearest neighbor search functor once per query on the given stream.
+/// All pointer arguments are dereferenced on the device, and the functor always reads
+/// node 0 of the tree.
+void KdTreeGPU::nearest_neighbor_search(
+  const Eigen::Vector3f* queries,
+  const size_t num_queries,
+  std::uint32_t* nn_indices,
+  float* nn_sq_dists,
+  CUstream_st* stream) {
+  // Bundle the tree, the queries and the output buffers into the per-query functor.
+  const nearest_neighbor_search_kernel search{points->points_gpu, indices, nodes, queries, nn_indices, nn_sq_dists};
+
+  // One functor invocation per query index in [0, num_queries).
+  const thrust::counting_iterator<std::uint32_t> first_query(0);
+  const thrust::counting_iterator<std::uint32_t> last_query(num_queries);
+
+  thrust::for_each(thrust::cuda::par.on(stream), first_query, last_query, search);
+}
+
+}  // namespace gtsam_points
diff --git a/src/test/test_kdtree.cpp b/src/test/test_kdtree.cpp
@@ -9,8 +9,11 @@
 #include <gtsam_points/ann/kdtree.hpp>
 #include <gtsam_points/ann/kdtree2.hpp>
 #include <gtsam_points/ann/kdtreex.hpp>
+#include <gtsam_points/ann/kdtree_cuda.hpp>
 #include <gtsam_points/types/point_cloud_cpu.hpp>
+#include <gtsam_points/types/point_cloud_gpu.hpp>
 #include <gtsam_points/util/parallelism.hpp>
+#include <gtsam_points/util/easy_profiler.hpp>
 
 class KdTreeTest : public testing::Test, public testing::WithParamInterface<std::string> {
   virtual void SetUp() {
@@ -239,6 +242,46 @@ TEST_P(KdTreeTest, RadiusTest) {
   }
 }
 
+#ifdef GTSAM_POINTS_USE_CUDA
+
+// Checks the GPU KdTree: every indexed point must find itself, and the results for the
+// fixture queries must match the ground-truth nearest neighbors.
+TEST_F(KdTreeTest, KdTreeGPU) {
+  gtsam_points::KdTree kdtree(points.data(), points.size());
+
+  auto points_gpu = std::make_shared<gtsam_points::PointCloudGPU>(points);
+  gtsam_points::KdTreeGPU kdtree_gpu(points_gpu);
+
+  // The GPU search takes 3D float points.
+  std::vector<Eigen::Vector3f> points_f;
+  points_f.reserve(points.size());
+  for (const Eigen::Vector4d& p : points) {
+    points_f.emplace_back(p.head<3>().cast<float>());
+  }
+
+  std::vector<Eigen::Vector3f> queries_f;
+  queries_f.reserve(queries.size());
+  for (const Eigen::Vector4d& q : queries) {
+    queries_f.emplace_back(q.head<3>().cast<float>());
+  }
+
+  // Self-check: searching with the indexed points themselves.
+  std::vector<std::uint32_t> nn_indices(points.size());
+  std::vector<float> nn_sq_dists(points.size());
+  kdtree_gpu.nearest_neighbor_search_cpu(points_f.data(), points_f.size(), nn_indices.data(), nn_sq_dists.data());
+
+  for (size_t k = 0; k < points.size(); k++) {
+    EXPECT_NEAR(nn_sq_dists[k], 0.0, 1e-6);
+    EXPECT_EQ(nn_indices[k], static_cast<std::uint32_t>(k));
+  }
+
+  // Query check against the ground truth.
+  nn_indices.resize(queries.size());
+  nn_sq_dists.resize(queries.size());
+  kdtree_gpu.nearest_neighbor_search_cpu(queries_f.data(), queries_f.size(), nn_indices.data(), nn_sq_dists.data());
+
+  for (size_t k = 0; k < queries.size(); k++) {
+    EXPECT_NEAR(nn_sq_dists[k], gt_sq_dists[k][0], 1e-2);
+
+    // The returned index may differ from the ground truth, so compare the distances
+    // recomputed in double precision.
+    const double found_sq_dist = (points[nn_indices[k]] - queries[k]).squaredNorm();
+    const double expected_sq_dist = (points[gt_indices[k][0]] - queries[k]).squaredNorm();
+    EXPECT_NEAR(found_sq_dist, expected_sq_dist, 1e-3);
+  }
+}
+
+#endif
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();