Skip to content

Commit e2826fc

Browse files
committed
kdtree_cuda
1 parent edf929f commit e2826fc

File tree

5 files changed

+297
-0
lines changed

5 files changed

+297
-0
lines changed

CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,9 @@ if(BUILD_WITH_CUDA)
236236
src/gtsam_points/cuda/nonlinear_factor_set_gpu_create.cpp
237237
src/gtsam_points/cuda/stream_roundrobin.cu
238238
src/gtsam_points/cuda/stream_temp_buffer_roundrobin.cu
239+
# ann
240+
src/gtsam_points/ann/kdtree_cuda.cpp
241+
src/gtsam_points/ann/kdtree_cuda.cu
239242
# types
240243
src/gtsam_points/types/point_cloud.cu
241244
src/gtsam_points/types/point_cloud_gpu.cu
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
// SPDX-License-Identifier: MIT
2+
// Copyright (c) 2025 Kenji Koide (k.koide@aist.go.jp)
3+
#pragma once
4+
5+
#include <limits>
6+
#include <cstdint>
7+
#include <gtsam_points/types/point_cloud.hpp>
8+
#include <gtsam_points/ann/small_kdtree.hpp>
9+
10+
struct CUstream_st;
11+
12+
namespace gtsam_points {
13+
14+
struct KdTreeNodeGPU {
15+
NodeIndexType left = INVALID_NODE; ///< Left child node index.
16+
NodeIndexType right = INVALID_NODE; ///< Right child node index.
17+
18+
union {
19+
struct Leaf {
20+
NodeIndexType first; ///< First point index in the leaf node.
21+
NodeIndexType last; ///< Last point index in the leaf node.
22+
} lr; ///< Leaf node.
23+
struct NonLeaf {
24+
NodeIndexType axis; ///< Projection axis.
25+
float thresh; ///< Threshold value.
26+
} sub; ///< Non-leaf node.
27+
} node_type;
28+
};
29+
30+
class KdTreeGPU {
31+
public:
32+
KdTreeGPU(const PointCloud::ConstPtr& points, CUstream_st* stream = nullptr);
33+
~KdTreeGPU();
34+
35+
void nearest_neighbor_search(
36+
const Eigen::Vector3f* queries,
37+
size_t num_queries,
38+
std::uint32_t* nn_indices,
39+
float* nn_sq_dists,
40+
CUstream_st* stream = nullptr);
41+
42+
void nearest_neighbor_search_cpu(
43+
const Eigen::Vector3f* h_queries,
44+
size_t num_queries,
45+
std::uint32_t* h_nn_indices,
46+
float* h_nn_sq_dists,
47+
CUstream_st* stream = nullptr);
48+
49+
private:
50+
PointCloud::ConstPtr points;
51+
size_t num_indices;
52+
size_t num_nodes;
53+
std::uint32_t* indices;
54+
KdTreeNodeGPU* nodes;
55+
};
56+
57+
} // namespace gtsam_points
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
// SPDX-License-Identifier: MIT
2+
// Copyright (c) 2025 Kenji Koide (k.koide@aist.go.jp)
3+
#include <gtsam_points/ann/kdtree_cuda.hpp>
4+
5+
#include <gtsam_points/cuda/check_error.cuh>
6+
#include <gtsam_points/ann/small_kdtree.hpp>
7+
8+
namespace gtsam_points {
9+
10+
KdTreeGPU::KdTreeGPU(const PointCloud::ConstPtr& points, CUstream_st* stream)
11+
: points(points),
12+
num_indices(0),
13+
num_nodes(0),
14+
indices(nullptr),
15+
nodes(nullptr) {
16+
//
17+
if (!points->has_points()) {
18+
std::cerr << "error: empty point cloud is given for KdTreeGPU" << std::endl;
19+
return;
20+
}
21+
if (!points->has_points_gpu()) {
22+
std::cerr << "error: point cloud does not have GPU points for KdTreeGPU" << std::endl;
23+
return;
24+
}
25+
26+
//
27+
KdTreeBuilder builder;
28+
UnsafeKdTree<PointCloud> kdtree(*points, builder);
29+
30+
// copy to GPU
31+
std::vector<std::uint32_t> h_indices(kdtree.indices.begin(), kdtree.indices.end());
32+
std::vector<KdTreeNodeGPU> h_nodes(kdtree.nodes.size());
33+
34+
for (int i = 0; i < kdtree.nodes.size(); i++) {
35+
const auto& in = kdtree.nodes[i];
36+
auto& out = h_nodes[i];
37+
38+
out.left = in.left;
39+
out.right = in.right;
40+
41+
if (in.left == INVALID_NODE) {
42+
out.node_type.lr.first = in.node_type.lr.first;
43+
out.node_type.lr.last = in.node_type.lr.last;
44+
} else {
45+
out.node_type.sub.axis = in.node_type.sub.proj.axis;
46+
out.node_type.sub.thresh = in.node_type.sub.thresh;
47+
}
48+
}
49+
50+
num_indices = kdtree.indices.size();
51+
num_nodes = kdtree.nodes.size();
52+
check_error << cudaMallocAsync(&indices, sizeof(std::uint32_t) * num_indices, stream);
53+
check_error << cudaMallocAsync(&nodes, sizeof(KdTreeNodeGPU) * num_nodes, stream);
54+
check_error << cudaMemcpyAsync(indices, h_indices.data(), sizeof(std::uint32_t) * num_indices, cudaMemcpyHostToDevice, stream);
55+
check_error << cudaMemcpyAsync(nodes, h_nodes.data(), sizeof(KdTreeNodeGPU) * num_nodes, cudaMemcpyHostToDevice, stream);
56+
}
57+
58+
KdTreeGPU::~KdTreeGPU() {
59+
check_error << cudaFreeAsync(indices, nullptr);
60+
check_error << cudaFreeAsync(nodes, nullptr);
61+
}
62+
63+
void KdTreeGPU::nearest_neighbor_search_cpu(
64+
const Eigen::Vector3f* h_queries,
65+
size_t num_queries,
66+
std::uint32_t* h_nn_indices,
67+
float* h_nn_sq_dists,
68+
CUstream_st* stream) {
69+
//
70+
Eigen::Vector3f* d_queries;
71+
std::uint32_t* d_nn_indices;
72+
float* d_nn_sq_dists;
73+
74+
check_error << cudaMallocAsync(&d_queries, sizeof(Eigen::Vector3f) * num_queries, stream);
75+
check_error << cudaMallocAsync(&d_nn_indices, sizeof(std::uint32_t) * num_queries, stream);
76+
check_error << cudaMallocAsync(&d_nn_sq_dists, sizeof(float) * num_queries, stream);
77+
check_error << cudaMemcpyAsync(d_queries, h_queries, sizeof(Eigen::Vector3f) * num_queries, cudaMemcpyHostToDevice, stream);
78+
79+
nearest_neighbor_search(d_queries, num_queries, d_nn_indices, d_nn_sq_dists, stream);
80+
81+
check_error << cudaMemcpyAsync(h_nn_indices, d_nn_indices, sizeof(std::uint32_t) * num_queries, cudaMemcpyDeviceToHost, stream);
82+
check_error << cudaMemcpyAsync(h_nn_sq_dists, d_nn_sq_dists, sizeof(float) * num_queries, cudaMemcpyDeviceToHost, stream);
83+
84+
check_error << cudaFreeAsync(d_queries, stream);
85+
check_error << cudaFreeAsync(d_nn_indices, stream);
86+
check_error << cudaFreeAsync(d_nn_sq_dists, stream);
87+
}
88+
89+
} // namespace gtsam_points
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
// SPDX-License-Identifier: MIT
2+
// Copyright (c) 2025 Kenji Koide (k.koide@aist.go.jp)
3+
#include <gtsam_points/ann/kdtree_cuda.hpp>
4+
5+
#include <algorithm>
6+
#include <thrust/copy.h>
7+
8+
#include <thrust/pair.h>
9+
#include <thrust/for_each.h>
10+
#include <thrust/iterator/counting_iterator.h>
11+
#include <gtsam_points/cuda/check_error.cuh>
12+
13+
namespace gtsam_points {
14+
15+
namespace {
16+
17+
struct nearest_neighbor_search_kernel {
18+
public:
19+
static constexpr int MAX_STACK_SIZE = 20;
20+
21+
__device__ void operator()(std::uint32_t i) const {
22+
const Eigen::Vector3f query = queries[i];
23+
24+
thrust::pair<NodeIndexType, float> result = {INVALID_NODE, std::numeric_limits<float>::max()};
25+
26+
int stack_size = 1;
27+
thrust::pair<int, float> search_stack[MAX_STACK_SIZE] = {{0, 0.0f}};
28+
29+
while (stack_size > 0) {
30+
const auto [node_index, sq_dist] = search_stack[--stack_size];
31+
if (sq_dist > result.second) {
32+
continue;
33+
}
34+
35+
const KdTreeNodeGPU node = nodes[node_index];
36+
37+
// Leaf node
38+
if (node.left == INVALID_NODE) {
39+
for (NodeIndexType i = node.node_type.lr.first; i < node.node_type.lr.last; i++) {
40+
const NodeIndexType pt_index = indices[i];
41+
const float sq_dist = (points[pt_index] - query).squaredNorm();
42+
if (sq_dist < result.second) {
43+
result = {pt_index, sq_dist};
44+
}
45+
}
46+
continue;
47+
}
48+
49+
const float val = query[node.node_type.sub.axis];
50+
const float diff = val - node.node_type.sub.thresh;
51+
const float cut_sq_dist = diff * diff;
52+
53+
int best_child;
54+
int other_child;
55+
56+
if (diff < 0.0f) {
57+
best_child = node.left;
58+
other_child = node.right;
59+
} else {
60+
best_child = node.right;
61+
other_child = node.left;
62+
}
63+
64+
if (stack_size > MAX_STACK_SIZE - 2) {
65+
printf("kdtree stack overflow!!");
66+
} else if (cut_sq_dist < result.second) {
67+
search_stack[stack_size].first = other_child;
68+
search_stack[stack_size++].second = cut_sq_dist;
69+
}
70+
71+
search_stack[stack_size].first = best_child;
72+
search_stack[stack_size++].second = 0.0f;
73+
}
74+
75+
nn_indices[i] = result.first;
76+
nn_sq_dists[i] = result.second;
77+
}
78+
79+
public:
80+
const Eigen::Vector3f* __restrict__ points;
81+
const std::uint32_t* __restrict__ indices;
82+
const KdTreeNodeGPU* __restrict__ nodes;
83+
84+
const Eigen::Vector3f* __restrict__ queries;
85+
86+
std::uint32_t* nn_indices;
87+
float* nn_sq_dists;
88+
};
89+
90+
} // namespace
91+
92+
void KdTreeGPU::nearest_neighbor_search(
93+
const Eigen::Vector3f* queries,
94+
const size_t num_queries,
95+
std::uint32_t* nn_indices,
96+
float* nn_sq_dists,
97+
CUstream_st* stream) {
98+
thrust::for_each(
99+
thrust::cuda::par.on(stream),
100+
thrust::counting_iterator<std::uint32_t>(0),
101+
thrust::counting_iterator<std::uint32_t>(num_queries),
102+
nearest_neighbor_search_kernel{points->points_gpu, indices, nodes, queries, nn_indices, nn_sq_dists});
103+
}
104+
105+
} // namespace gtsam_points

src/test/test_kdtree.cpp

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,11 @@
99
#include <gtsam_points/ann/kdtree.hpp>
1010
#include <gtsam_points/ann/kdtree2.hpp>
1111
#include <gtsam_points/ann/kdtreex.hpp>
12+
#include <gtsam_points/ann/kdtree_cuda.hpp>
1213
#include <gtsam_points/types/point_cloud_cpu.hpp>
14+
#include <gtsam_points/types/point_cloud_gpu.hpp>
1315
#include <gtsam_points/util/parallelism.hpp>
16+
#include <gtsam_points/util/easy_profiler.hpp>
1417

1518
class KdTreeTest : public testing::Test, public testing::WithParamInterface<std::string> {
1619
virtual void SetUp() {
@@ -239,6 +242,46 @@ TEST_P(KdTreeTest, RadiusTest) {
239242
}
240243
}
241244

245+
#ifdef GTSAM_POINTS_USE_CUDA
246+
247+
TEST_F(KdTreeTest, KdTreeGPU) {
248+
gtsam_points::KdTree kdtree(points.data(), points.size());
249+
250+
auto points_gpu = std::make_shared<gtsam_points::PointCloudGPU>(points);
251+
gtsam_points::KdTreeGPU kdtree_gpu(points_gpu);
252+
253+
std::vector<Eigen::Vector3f> points_f(points.size());
254+
std::transform(points.begin(), points.end(), points_f.begin(), [](const Eigen::Vector4d& p) { return p.head<3>().cast<float>(); });
255+
256+
std::vector<Eigen::Vector3f> queries_f(queries.size());
257+
std::transform(queries.begin(), queries.end(), queries_f.begin(), [](const Eigen::Vector4d& p) { return p.head<3>().cast<float>(); });
258+
259+
// self-check
260+
std::vector<std::uint32_t> nn_indices(points.size());
261+
std::vector<float> nn_sq_dists(points.size());
262+
kdtree_gpu.nearest_neighbor_search_cpu(points_f.data(), points_f.size(), nn_indices.data(), nn_sq_dists.data());
263+
264+
for (int i = 0; i < points.size(); i++) {
265+
EXPECT_NEAR(nn_sq_dists[i], 0.0, 1e-6);
266+
EXPECT_EQ(nn_indices[i], i);
267+
}
268+
269+
// query check
270+
nn_indices.resize(queries.size());
271+
nn_sq_dists.resize(queries.size());
272+
kdtree_gpu.nearest_neighbor_search_cpu(queries_f.data(), queries_f.size(), nn_indices.data(), nn_sq_dists.data());
273+
274+
for (int i = 0; i < queries.size(); i++) {
275+
EXPECT_NEAR(nn_sq_dists[i], gt_sq_dists[i][0], 1e-2);
276+
277+
const double d1 = (points[nn_indices[i]] - queries[i]).squaredNorm();
278+
const double d2 = (points[gt_indices[i][0]] - queries[i]).squaredNorm();
279+
EXPECT_NEAR(d1, d2, 1e-3);
280+
}
281+
}
282+
283+
#endif
284+
242285
int main(int argc, char** argv) {
243286
testing::InitGoogleTest(&argc, argv);
244287
return RUN_ALL_TESTS();

0 commit comments

Comments
 (0)