Skip to content

Commit cc67077

Browse files
committed
Add knn dispatch, CPU and CUDA kernels
CPU kernel uses nanoflann KD-tree for efficient nearest neighbor search. CUDA kernel uses brute-force pairwise distance with insertion sort top-k, supporting cosine distance. Python wrapper and tests included.
1 parent 3337ec4 commit cc67077

File tree

6 files changed

+596
-0
lines changed

6 files changed

+596
-0
lines changed
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
#include "../knn.h"
2+
3+
#include <ATen/ATen.h>
4+
#include <torch/library.h>
5+
6+
#include "utils/KDTreeVectorOfVectorsAdaptor.h"
7+
#include "utils/nanoflann.hpp"
8+
9+
namespace pyg {
10+
namespace ops {
11+
12+
namespace {
13+
14+
at::Tensor knn_kernel(const at::Tensor& x,
                      const at::Tensor& y,
                      const std::optional<at::Tensor>& ptr_x,
                      const std::optional<at::Tensor>& ptr_y,
                      int64_t k,
                      bool cosine,
                      int64_t num_workers) {
  // For every row of `y`, finds its (up to) `k` nearest neighbors in `x`
  // using a nanoflann KD-tree over squared Euclidean distance. If `ptr_x`
  // and `ptr_y` (CSR-style batch offsets into `x`/`y`) are given, one tree
  // is built per example and queries are restricted to the matching example.
  // Returns a `[2, num_matches]` index tensor, row 0 holding `y` indices and
  // row 1 holding `x` indices. `num_workers` is currently unused on CPU.
  TORCH_CHECK(!cosine, "`cosine` argument not supported on CPU");
  // The batch pointers only make sense as a pair; catching a mismatch here
  // avoids an unhelpful `std::optional` access error (or silently searching
  // across example boundaries) further down.
  TORCH_CHECK(ptr_x.has_value() == ptr_y.has_value(),
              "`ptr_x` and `ptr_y` must either both be set or both be unset");
  if (ptr_x.has_value()) {
    TORCH_CHECK(ptr_x.value().numel() == ptr_y.value().numel(),
                "ptr_x and ptr_y must have the same number of elements");
  }

  // Flat (x_index, y_index) pairs; viewed as a `[size, 2]` tensor at the end.
  std::vector<size_t> out_vec;

  AT_DISPATCH_ALL_TYPES_AND2(
      at::ScalarType::Half, at::ScalarType::BFloat16, x.scalar_type(),
      "knn_cpu", [&] {
        auto x_data = x.data_ptr<scalar_t>();
        auto y_data = y.data_ptr<scalar_t>();
        typedef std::vector<std::vector<scalar_t>> vec_t;

        if (!ptr_x.has_value()) {  // Single example over all points:
          // Copy `x` into the vector-of-vectors layout nanoflann adapts over.
          vec_t pts(x.size(0));
          for (int64_t i = 0; i < x.size(0); i++) {
            pts[i].resize(x.size(1));
            for (int64_t j = 0; j < x.size(1); j++) {
              pts[i][j] = x_data[i * x.size(1) + j];
            }
          }

          typedef KDTreeVectorOfVectorsAdaptor<vec_t, scalar_t> my_kd_tree_t;
          my_kd_tree_t mat_index(x.size(1), pts, 10);  // 10 = max leaf size

          std::vector<size_t> ret_index(k);
          std::vector<scalar_t> out_dist_sqr(k);
          for (int64_t i = 0; i < y.size(0); i++) {
            // `num_matches` may be < k when `x` holds fewer than `k` points.
            size_t num_matches = mat_index.index->knnSearch(
                y_data + i * y.size(1), k, &ret_index[0], &out_dist_sqr[0]);
            for (size_t j = 0; j < num_matches; j++) {
              out_vec.push_back(ret_index[j]);
              out_vec.push_back(i);
            }
          }
        } else {  // Batched: one KD-tree per example.
          auto ptr_x_data = ptr_x.value().data_ptr<int64_t>();
          auto ptr_y_data = ptr_y.value().data_ptr<int64_t>();

          for (int64_t b = 0; b < ptr_x.value().size(0) - 1; b++) {
            auto x_start = ptr_x_data[b], x_end = ptr_x_data[b + 1];
            auto y_start = ptr_y_data[b], y_end = ptr_y_data[b + 1];

            if (x_start == x_end || y_start == y_end)
              continue;  // Nothing to match in this example.

            vec_t pts(x_end - x_start);
            for (int64_t i = 0; i < x_end - x_start; i++) {
              pts[i].resize(x.size(1));
              for (int64_t j = 0; j < x.size(1); j++) {
                pts[i][j] = x_data[(i + x_start) * x.size(1) + j];
              }
            }

            typedef KDTreeVectorOfVectorsAdaptor<vec_t, scalar_t> my_kd_tree_t;
            my_kd_tree_t mat_index(x.size(1), pts, 10);

            std::vector<size_t> ret_index(k);
            std::vector<scalar_t> out_dist_sqr(k);
            for (int64_t i = y_start; i < y_end; i++) {
              size_t num_matches = mat_index.index->knnSearch(
                  y_data + i * y.size(1), k, &ret_index[0], &out_dist_sqr[0]);
              for (size_t j = 0; j < num_matches; j++) {
                // Tree indices are local to the example; shift back to
                // global `x` indices.
                out_vec.push_back(x_start + ret_index[j]);
                out_vec.push_back(i);
              }
            }
          }
        }
      });

  // View the flat pair buffer as `[size, 2]`, transpose, and swap the two
  // rows so the output is `[2, size]` in (y, x) order. `.clone()` detaches
  // the result from the function-local `out_vec` storage before it dies.
  const int64_t size = out_vec.size() / 2;
  auto out =
      at::from_blob(out_vec.data(), {size, 2}, x.options().dtype(at::kLong));
  return out.t().index_select(0, at::tensor({1, 0})).clone();
}
95+
96+
} // namespace
97+
98+
// Register the CPU implementation of the `pyg::knn` operator with the
// PyTorch dispatcher (schema is declared elsewhere via TORCH_LIBRARY_FRAGMENT).
TORCH_LIBRARY_IMPL(pyg, CPU, m) {
  m.impl(TORCH_SELECTIVE_NAME("pyg::knn"), TORCH_FN(knn_kernel));
}
101+
102+
} // namespace ops
103+
} // namespace pyg
Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
#include "../knn.h"
2+
#include "utils.cuh"
3+
4+
#include <ATen/ATen.h>
5+
#include <ATen/cuda/CUDAContext.h>
6+
#include <torch/library.h>
7+
8+
namespace pyg {
9+
namespace ops {
10+
11+
namespace {
12+
13+
#define KNN_THREADS 256
14+
15+
template <typename scalar_t>
struct Cosine {
  // Inner product of row `n_a` of `a` with row `n_b` of `b`, where each
  // row holds `size` consecutive features (row-major, contiguous).
  static inline __device__ scalar_t dot(const scalar_t* a,
                                        const scalar_t* b,
                                        int64_t n_a,
                                        int64_t n_b,
                                        int64_t size) {
    const scalar_t* row_a = a + n_a * size;
    const scalar_t* row_b = b + n_b * size;
    scalar_t sum = 0;
    for (int64_t d = 0; d < size; d++) {
      sum += row_a[d] * row_b[d];
    }
    return sum;
  }

  // L2 norm of row `n_a` of `a` (sqrt of the row's self inner product).
  static inline __device__ scalar_t norm(const scalar_t* a,
                                         int64_t n_a,
                                         int64_t size) {
    const scalar_t* row = a + n_a * size;
    scalar_t sum = 0;
    for (int64_t d = 0; d < size; d++) {
      sum += row[d] * row[d];
    }
    return sqrt(sum);
  }
};
39+
40+
// Brute-force k-NN: one thread per query point of `y`. Each thread scans
// every candidate point of its own example in `x` and maintains a running
// top-k (sorted ascending by distance) via insertion sort in local arrays.
// Preconditions (enforced by the host wrapper): `k <= 100`, `x`/`y`
// contiguous row-major [*, dim], `ptr_x`/`ptr_y` hold `num_examples + 1`
// CSR-style offsets.
template <typename scalar_t>
__global__ void knn_cuda_kernel(const scalar_t* __restrict__ x,
                                const scalar_t* __restrict__ y,
                                const int64_t* __restrict__ ptr_x,
                                const int64_t* __restrict__ ptr_y,
                                int64_t* __restrict__ row,
                                int64_t* __restrict__ col,
                                const int64_t k,
                                const int64_t n,
                                const int64_t m,
                                const int64_t dim,
                                const int64_t num_examples,
                                const bool cosine) {
  // Flat global thread id = query index into `y`; guard the grid tail.
  const int64_t n_y = blockIdx.x * blockDim.x + threadIdx.x;
  if (n_y >= m)
    return;

  // Which batch example this query belongs to (looked up via `ptr_y`).
  const int64_t example_idx = get_example_idx(n_y, ptr_y, num_examples);

  // Running top-k, kept sorted ascending by distance. Fixed capacity of
  // 100 entries; the host wrapper rejects `k > 100`.
  scalar_t best_dist[100];
  int64_t best_idx[100];

  for (int e = 0; e < k; e++) {
    best_dist[e] = (scalar_t)1e10;  // sentinel distance: "slot unfilled"
    best_idx[e] = -1;               // -1 marks unfilled slots for the host
  }

  // Scan all candidate points belonging to the same example.
  for (int64_t n_x = ptr_x[example_idx]; n_x < ptr_x[example_idx + 1]; n_x++) {
    scalar_t tmp_dist = 0;

    if (cosine) {
      // Cosine distance: 1 - cosine similarity of the two rows.
      tmp_dist = Cosine<scalar_t>::dot(x, y, n_x, n_y, dim) /
                 (Cosine<scalar_t>::norm(x, n_x, dim) *
                  Cosine<scalar_t>::norm(y, n_y, dim));
      tmp_dist = (scalar_t)1. - tmp_dist;
    } else {
      // Squared Euclidean distance (no sqrt needed for ranking).
      for (int64_t d = 0; d < dim; d++) {
        scalar_t diff = x[n_x * dim + d] - y[n_y * dim + d];
        tmp_dist += diff * diff;
      }
    }

    // Insertion sort step: find the first slot this candidate beats,
    // shift the tail one position right, and insert.
    for (int64_t e1 = 0; e1 < k; e1++) {
      if (scalar_gt(best_dist[e1], tmp_dist)) {
        for (int64_t e2 = k - 1; e2 > e1; e2--) {
          best_dist[e2] = best_dist[e2 - 1];
          best_idx[e2] = best_idx[e2 - 1];
        }
        best_dist[e1] = tmp_dist;
        best_idx[e1] = n_x;
        break;
      }
    }
  }

  // Emit the k slots for this query; unfilled slots keep `best_idx == -1`
  // and are masked out by the host wrapper.
  for (int64_t e = 0; e < k; e++) {
    row[n_y * k + e] = n_y;
    col[n_y * k + e] = best_idx[e];
  }
}
100+
101+
at::Tensor knn_cuda(const at::Tensor& x,
                    const at::Tensor& y,
                    const std::optional<at::Tensor>& ptr_x,
                    const std::optional<at::Tensor>& ptr_y,
                    int64_t k,
                    bool cosine,
                    int64_t num_workers) {
  // CUDA host wrapper: validates inputs, materializes default batch
  // pointers, launches the brute-force kernel (one thread per `y` row), and
  // compacts the results into a `[2, num_matches]` index tensor (row 0: `y`
  // indices, row 1: `x` indices). `num_workers` is ignored on CUDA.
  TORCH_CHECK(x.is_cuda() && y.is_cuda(), "Inputs must be CUDA tensors");
  TORCH_CHECK(x.is_contiguous() && y.is_contiguous(),
              "Inputs must be contiguous");
  // The kernel keeps its running top-k in fixed-size local arrays of 100.
  TORCH_CHECK(k <= 100, "`k` must be <= 100");

  // A zero-block grid is an invalid launch configuration, and the default
  // `at::arange` below would use a step of 0 — short-circuit empty inputs.
  if (x.size(0) == 0 || y.size(0) == 0)
    return at::empty({2, 0}, x.options().dtype(at::kLong));

  std::optional<at::Tensor> ptr_x_v = ptr_x;
  std::optional<at::Tensor> ptr_y_v = ptr_y;

  // Without explicit batch pointers, treat all points as a single example:
  // offsets [0, num_points].
  if (!ptr_x_v.has_value())
    ptr_x_v =
        at::arange(0, x.size(0) + 1, x.size(0), x.options().dtype(at::kLong));
  if (!ptr_y_v.has_value())
    ptr_y_v =
        at::arange(0, y.size(0) + 1, y.size(0), y.options().dtype(at::kLong));

  // The kernel reads raw `data_ptr`s, so user-supplied pointer tensors must
  // live on the GPU and be contiguous.
  TORCH_CHECK(ptr_x_v.value().is_cuda() && ptr_y_v.value().is_cuda(),
              "`ptr_x` and `ptr_y` must be CUDA tensors");
  TORCH_CHECK(
      ptr_x_v.value().is_contiguous() && ptr_y_v.value().is_contiguous(),
      "`ptr_x` and `ptr_y` must be contiguous");
  TORCH_CHECK(ptr_x_v.value().numel() == ptr_y_v.value().numel(),
              "ptr_x and ptr_y must have the same number of elements");

  // One output slot per (query, neighbor); `col == -1` marks slots left
  // unfilled because the example held fewer than `k` candidates.
  auto row = at::empty({y.size(0) * k}, ptr_y_v.value().options());
  auto col = at::full({y.size(0) * k}, -1, ptr_y_v.value().options());

  // Ceil-div so every query row gets a thread; tail guarded in the kernel.
  dim3 BLOCKS((y.size(0) + KNN_THREADS - 1) / KNN_THREADS);
  auto stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND(
      at::ScalarType::Half, x.scalar_type(), "knn_cuda", [&] {
        knn_cuda_kernel<scalar_t><<<BLOCKS, KNN_THREADS, 0, stream>>>(
            x.data_ptr<scalar_t>(), y.data_ptr<scalar_t>(),
            ptr_x_v.value().data_ptr<int64_t>(),
            ptr_y_v.value().data_ptr<int64_t>(), row.data_ptr<int64_t>(),
            col.data_ptr<int64_t>(), k, x.size(0), y.size(0), x.size(1),
            ptr_x_v.value().numel() - 1, cosine);
        C10_CUDA_KERNEL_LAUNCH_CHECK();
      });

  // Drop unfilled slots and stack into the [2, num_matches] result.
  auto mask = col != -1;
  return at::stack({row.masked_select(mask), col.masked_select(mask)}, 0);
}
145+
146+
} // namespace
147+
148+
// Register the CUDA implementation of the `pyg::knn` operator with the
// PyTorch dispatcher (schema is declared elsewhere via TORCH_LIBRARY_FRAGMENT).
TORCH_LIBRARY_IMPL(pyg, CUDA, m) {
  m.impl(TORCH_SELECTIVE_NAME("pyg::knn"), TORCH_FN(knn_cuda));
}
151+
152+
} // namespace ops
153+
} // namespace pyg

pyg_lib/csrc/ops/knn.cpp

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
#include "knn.h"
2+
3+
#include <ATen/core/dispatch/Dispatcher.h>
4+
#include <torch/library.h>
5+
6+
namespace pyg {
7+
namespace ops {
8+
9+
PYG_API at::Tensor knn(const at::Tensor& x,
                       const at::Tensor& y,
                       const std::optional<at::Tensor>& ptr_x,
                       const std::optional<at::Tensor>& ptr_y,
                       int64_t k,
                       bool cosine,
                       int64_t num_workers) {
  // Dispatch entry point for `pyg::knn`: validates arguments, makes the
  // inputs contiguous (the device kernels index raw pointers row-major), and
  // forwards to the CPU/CUDA implementation registered with the dispatcher.
  // Returns a `[2, num_matches]` index tensor.
  at::TensorArg x_arg{x, "x", 0};
  at::TensorArg y_arg{y, "y", 1};
  at::CheckedFrom c{"knn"};

  at::checkAllDefined(c, {x_arg, y_arg});
  at::checkDim(c, x_arg, 2);
  at::checkDim(c, y_arg, 2);
  // The kernels dispatch on `x`'s scalar type only and then reinterpret
  // `y`'s storage with the same type, so a dtype mismatch must be rejected
  // here rather than failing deep inside a kernel.
  at::checkSameType(c, x_arg, y_arg);

  TORCH_CHECK(x.size(1) == y.size(1), "x and y must have the same feature dim");
  TORCH_CHECK(k > 0, "k must be positive");
  // Batch pointers only make sense as a pair; a lone `ptr_x` would otherwise
  // surface as an unhelpful optional-access error inside the kernels.
  TORCH_CHECK(ptr_x.has_value() == ptr_y.has_value(),
              "`ptr_x` and `ptr_y` must either both be set or both be unset");

  auto x_c = x.contiguous();
  auto y_c = y.contiguous();

  // Resolve the schema once; `op` is cached across calls.
  static auto op = c10::Dispatcher::singleton()
                       .findSchemaOrThrow("pyg::knn", "")
                       .typed<decltype(knn)>();
  return op.call(x_c, y_c, ptr_x, ptr_y, k, cosine, num_workers);
}
35+
36+
// Declare the `pyg::knn` operator schema. Per-device implementations are
// registered separately via TORCH_LIBRARY_IMPL (CPU and CUDA).
TORCH_LIBRARY_FRAGMENT(pyg, m) {
  m.def(
      TORCH_SELECTIVE_SCHEMA("pyg::knn(Tensor x, Tensor y, Tensor? ptr_x=None, "
                             "Tensor? ptr_y=None, int k=1, bool cosine=False, "
                             "int num_workers=1) -> Tensor"));
}
42+
43+
} // namespace ops
44+
} // namespace pyg

pyg_lib/csrc/ops/knn.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
#pragma once
2+
3+
#include <ATen/ATen.h>
4+
#include "pyg_lib/csrc/macros.h"
5+
6+
namespace pyg {
7+
namespace ops {
8+
9+
// Finds for each row in `y` its `k` nearest neighbors in `x` and returns a
// `[2, num_matches]` index tensor (row 0: `y` indices, row 1: `x` indices).
// `ptr_x`/`ptr_y` optionally provide CSR-style batch offsets so neighbors
// are only searched within the same example; they must be given together.
// `cosine` selects cosine distance (supported on CUDA only; the CPU kernel
// rejects it). `num_workers` is currently unused by both kernels.
PYG_API at::Tensor knn(const at::Tensor& x,
                       const at::Tensor& y,
                       const std::optional<at::Tensor>& ptr_x,
                       const std::optional<at::Tensor>& ptr_y,
                       int64_t k,
                       bool cosine,
                       int64_t num_workers);
16+
17+
} // namespace ops
18+
} // namespace pyg

0 commit comments

Comments
 (0)