Skip to content

Commit d1c8562

Browse files
committed
Add radius dispatch, CPU and CUDA kernels
CPU kernel uses nanoflann KD-tree radiusSearch. CUDA kernel uses brute-force pairwise squared Euclidean distance. Supports max_num_neighbors cap and ignore_same_index. Tests included.
1 parent a755d57 commit d1c8562

File tree

5 files changed

+364
-0
lines changed

5 files changed

+364
-0
lines changed
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
#include "../radius.h"
2+
3+
#include <ATen/ATen.h>
4+
#include <torch/library.h>
5+
6+
#include "utils/KDTreeVectorOfVectorsAdaptor.h"
7+
#include "utils/nanoflann.hpp"
8+
9+
namespace pyg {
10+
namespace ops {
11+
12+
namespace {
13+
14+
// CPU backend for `pyg::radius`: for every point in `y`, finds all points in
// `x` within Euclidean distance `r` using a nanoflann KD-tree, capped at
// `max_num_neighbors` matches per query. Returns a `[2, num_edges]` long
// tensor with y (query) indices in row 0 and x (reference) indices in row 1.
// `ptr_x`/`ptr_y` are optional CSR offsets partitioning `x`/`y` into
// independent batch examples. `num_workers` is accepted for schema
// compatibility but is not used by this backend.
at::Tensor radius_kernel(const at::Tensor& x,
                         const at::Tensor& y,
                         const std::optional<at::Tensor>& ptr_x,
                         const std::optional<at::Tensor>& ptr_y,
                         double r,
                         int64_t max_num_neighbors,
                         int64_t num_workers,
                         bool ignore_same_index) {
  // Flat list of (x_index, y_index) pairs. Use int64_t rather than size_t so
  // the raw buffer layout matches `at::kLong` on every platform (size_t is
  // 32-bit on some targets, which would corrupt the `from_blob` view below).
  std::vector<int64_t> out_vec;

  // Batched mode needs both offset tensors; fail loudly instead of
  // dereferencing an empty optional in the `else` branch below.
  TORCH_CHECK(ptr_x.has_value() == ptr_y.has_value(),
              "'ptr_x' and 'ptr_y' must either both be given or both be absent");
  if (ptr_x.has_value()) {
    TORCH_CHECK(ptr_x.value().numel() == ptr_y.value().numel(),
                "'ptr_x' and 'ptr_y' must have the same number of elements");
  }

  AT_DISPATCH_ALL_TYPES_AND2(
      at::ScalarType::Half, at::ScalarType::BFloat16, x.scalar_type(),
      "radius_cpu", [&] {
        auto x_data = x.data_ptr<scalar_t>();
        auto y_data = y.data_ptr<scalar_t>();
        typedef std::vector<std::vector<scalar_t>> vec_t;
        nanoflann::SearchParams params;
        params.sorted = false;  // Match order is irrelevant for the output.

        if (!ptr_x.has_value()) {  // Single example.
          // Copy `x` into the vector-of-vectors layout the adaptor expects.
          vec_t pts(x.size(0));
          for (int64_t i = 0; i < x.size(0); i++) {
            pts[i].resize(x.size(1));
            for (int64_t j = 0; j < x.size(1); j++) {
              pts[i][j] = x_data[i * x.size(1) + j];
            }
          }

          typedef KDTreeVectorOfVectorsAdaptor<vec_t, scalar_t> my_kd_tree_t;
          my_kd_tree_t mat_index(x.size(1), pts, 10);

          for (int64_t i = 0; i < y.size(0); i++) {
            std::vector<std::pair<size_t, scalar_t>> ret_matches;
            // nanoflann expects the *squared* search radius.
            size_t num_matches = mat_index.index->radiusSearch(
                y_data + i * y.size(1), r * r, ret_matches, params);

            for (size_t j = 0, count = 0;
                 j < num_matches && count < (size_t)max_num_neighbors; j++) {
              if (!ignore_same_index ||
                  ret_matches[j].first != static_cast<size_t>(i)) {
                out_vec.push_back(static_cast<int64_t>(ret_matches[j].first));
                out_vec.push_back(i);
                count++;
              }
            }
          }
        } else {  // Batched mode: build one KD-tree per example.
          auto ptr_x_data = ptr_x.value().data_ptr<int64_t>();
          auto ptr_y_data = ptr_y.value().data_ptr<int64_t>();

          for (int64_t b = 0; b < ptr_x.value().size(0) - 1; b++) {
            auto x_start = ptr_x_data[b], x_end = ptr_x_data[b + 1];
            auto y_start = ptr_y_data[b], y_end = ptr_y_data[b + 1];

            // Nothing to match in an empty example.
            if (x_start == x_end || y_start == y_end)
              continue;

            vec_t pts(x_end - x_start);
            for (int64_t i = 0; i < x_end - x_start; i++) {
              pts[i].resize(x.size(1));
              for (int64_t j = 0; j < x.size(1); j++) {
                pts[i][j] = x_data[(i + x_start) * x.size(1) + j];
              }
            }

            typedef KDTreeVectorOfVectorsAdaptor<vec_t, scalar_t> my_kd_tree_t;
            my_kd_tree_t mat_index(x.size(1), pts, 10);

            for (int64_t i = y_start; i < y_end; i++) {
              std::vector<std::pair<size_t, scalar_t>> ret_matches;
              size_t num_matches = mat_index.index->radiusSearch(
                  y_data + i * y.size(1), r * r, ret_matches, params);

              for (size_t j = 0, count = 0;
                   j < num_matches && count < (size_t)max_num_neighbors; j++) {
                // Tree indices are local to this example; shift by `x_start`
                // before comparing against or emitting global indices.
                if (!ignore_same_index ||
                    x_start + static_cast<int64_t>(ret_matches[j].first) != i) {
                  out_vec.push_back(
                      x_start + static_cast<int64_t>(ret_matches[j].first));
                  out_vec.push_back(i);
                  count++;
                }
              }
            }
          }
        }
      });

  const int64_t size = out_vec.size() / 2;
  // View the flat pairs as [size, 2], transpose, and swap the two rows so the
  // result is [2, size] with y (query) indices first. `.clone()` materializes
  // the data before `out_vec` goes out of scope.
  auto out =
      at::from_blob(out_vec.data(), {size, 2}, x.options().dtype(at::kLong));
  return out.t().index_select(0, at::tensor({1, 0})).clone();
}
106+
107+
} // namespace
108+
109+
// Register the nanoflann-based kernel as the CPU backend for `pyg::radius`.
TORCH_LIBRARY_IMPL(pyg, CPU, m) {
  m.impl(TORCH_SELECTIVE_NAME("pyg::radius"), TORCH_FN(radius_kernel));
}
112+
113+
} // namespace ops
114+
} // namespace pyg
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
#include "../radius.h"
2+
#include "utils.cuh"
3+
4+
#include <ATen/ATen.h>
5+
#include <ATen/cuda/CUDAContext.h>
6+
#include <torch/library.h>
7+
8+
namespace pyg {
9+
namespace ops {
10+
11+
namespace {
12+
13+
#define RADIUS_THREADS 256
14+
15+
template <typename scalar_t>
16+
__global__ void radius_cuda_kernel(const scalar_t* __restrict__ x,
17+
const scalar_t* __restrict__ y,
18+
const int64_t* __restrict__ ptr_x,
19+
const int64_t* __restrict__ ptr_y,
20+
int64_t* __restrict__ row,
21+
int64_t* __restrict__ col,
22+
const scalar_t r,
23+
const int64_t n,
24+
const int64_t m,
25+
const int64_t dim,
26+
const int64_t num_examples,
27+
const int64_t max_num_neighbors,
28+
const bool ignore_same_index) {
29+
const int64_t n_y = blockIdx.x * blockDim.x + threadIdx.x;
30+
if (n_y >= m)
31+
return;
32+
33+
int64_t count = 0;
34+
const int64_t example_idx = get_example_idx(n_y, ptr_y, num_examples);
35+
36+
for (int64_t n_x = ptr_x[example_idx]; n_x < ptr_x[example_idx + 1]; n_x++) {
37+
scalar_t dist = 0;
38+
for (int64_t d = 0; d < dim; d++) {
39+
scalar_t diff = x[n_x * dim + d] - y[n_y * dim + d];
40+
dist += diff * diff;
41+
}
42+
43+
if (scalar_lt(dist, r) && !(ignore_same_index && n_y == n_x)) {
44+
row[n_y * max_num_neighbors + count] = n_y;
45+
col[n_y * max_num_neighbors + count] = n_x;
46+
count++;
47+
}
48+
49+
if (count >= max_num_neighbors)
50+
break;
51+
}
52+
}
53+
54+
// CUDA backend for `pyg::radius`: launches one thread per query point that
// brute-forces squared Euclidean distances against its example's slice of
// `x`. Returns a `[2, num_edges]` long tensor with y (query) indices in row 0
// and x (reference) indices in row 1. `num_workers` is accepted for schema
// compatibility but is not used by this backend.
at::Tensor radius_cuda(const at::Tensor& x,
                       const at::Tensor& y,
                       const std::optional<at::Tensor>& ptr_x,
                       const std::optional<at::Tensor>& ptr_y,
                       double r,
                       int64_t max_num_neighbors,
                       int64_t num_workers,
                       bool ignore_same_index) {
  TORCH_CHECK(x.is_cuda() && y.is_cuda(), "Inputs must be CUDA tensors");
  TORCH_CHECK(x.is_contiguous() && y.is_contiguous(),
              "Inputs must be contiguous");

  // Bail out early on empty inputs: the `at::arange` fallback below would be
  // called with a step of zero, and a zero-sized grid is an invalid CUDA
  // launch configuration.
  if (x.size(0) == 0 || y.size(0) == 0)
    return at::empty({2, 0}, x.options().dtype(at::kLong));

  std::optional<at::Tensor> ptr_x_v = ptr_x;
  std::optional<at::Tensor> ptr_y_v = ptr_y;

  // Without explicit offsets, treat all points as one example: [0, num].
  if (!ptr_x_v.has_value())
    ptr_x_v =
        at::arange(0, x.size(0) + 1, x.size(0), x.options().dtype(at::kLong));
  if (!ptr_y_v.has_value())
    ptr_y_v =
        at::arange(0, y.size(0) + 1, y.size(0), y.options().dtype(at::kLong));

  TORCH_CHECK(ptr_x_v.value().numel() == ptr_y_v.value().numel(),
              "ptr_x and ptr_y must have the same number of elements");

  // Preallocate dense per-query output slots; slots left at -1 by the kernel
  // are masked out below.
  auto row =
      at::full({y.size(0) * max_num_neighbors}, -1, ptr_y_v.value().options());
  auto col =
      at::full({y.size(0) * max_num_neighbors}, -1, ptr_y_v.value().options());

  dim3 BLOCKS((y.size(0) + RADIUS_THREADS - 1) / RADIUS_THREADS);
  auto stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND2(
      at::ScalarType::Half, at::ScalarType::BFloat16, x.scalar_type(),
      "radius_cuda", [&] {
        // The kernel compares squared distances, so pass r^2.
        radius_cuda_kernel<scalar_t><<<BLOCKS, RADIUS_THREADS, 0, stream>>>(
            x.data_ptr<scalar_t>(), y.data_ptr<scalar_t>(),
            ptr_x_v.value().data_ptr<int64_t>(),
            ptr_y_v.value().data_ptr<int64_t>(), row.data_ptr<int64_t>(),
            col.data_ptr<int64_t>(), (scalar_t)(r * r), x.size(0), y.size(0),
            x.size(1), ptr_x_v.value().numel() - 1, max_num_neighbors,
            ignore_same_index);
        C10_CUDA_KERNEL_LAUNCH_CHECK();
      });

  // Compact the dense slots into a [2, num_edges] edge list.
  auto mask = row != -1;
  return at::stack({row.masked_select(mask), col.masked_select(mask)}, 0);
}
102+
103+
} // namespace
104+
105+
// Register the brute-force kernel as the CUDA backend for `pyg::radius`.
TORCH_LIBRARY_IMPL(pyg, CUDA, m) {
  m.impl(TORCH_SELECTIVE_NAME("pyg::radius"), TORCH_FN(radius_cuda));
}
108+
109+
} // namespace ops
110+
} // namespace pyg

pyg_lib/csrc/ops/radius.cpp

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
#include "radius.h"
2+
3+
#include <ATen/core/dispatch/Dispatcher.h>
4+
#include <torch/library.h>
5+
6+
namespace pyg {
7+
namespace ops {
8+
9+
// Device-agnostic entry point for `pyg::radius`. Validates the arguments,
// normalizes memory layout, and dispatches to the registered CPU/CUDA
// kernel. Returns a `[2, num_edges]` long tensor of (query, reference)
// index pairs.
PYG_API at::Tensor radius(const at::Tensor& x,
                          const at::Tensor& y,
                          const std::optional<at::Tensor>& ptr_x,
                          const std::optional<at::Tensor>& ptr_y,
                          double r,
                          int64_t max_num_neighbors,
                          int64_t num_workers,
                          bool ignore_same_index) {
  at::TensorArg x_arg{x, "x", 0};
  at::TensorArg y_arg{y, "y", 1};
  at::CheckedFrom c{"radius"};

  at::checkAllDefined(c, {x_arg, y_arg});
  at::checkDim(c, x_arg, 2);
  at::checkDim(c, y_arg, 2);
  TORCH_CHECK(x.size(1) == y.size(1), "x and y must have the same feature dim");
  TORCH_CHECK(r > 0, "r must be positive");
  // Batched mode is all-or-nothing; a half-specified pair would make the
  // backends dereference an empty optional.
  TORCH_CHECK(ptr_x.has_value() == ptr_y.has_value(),
              "'ptr_x' and 'ptr_y' must either both be given or both be absent");

  auto x_c = x.contiguous();
  auto y_c = y.contiguous();

  // The backends read the offset tensors through raw int64 pointers, so
  // enforce dtype and contiguity once here rather than in every kernel.
  std::optional<at::Tensor> ptr_x_c = ptr_x;
  std::optional<at::Tensor> ptr_y_c = ptr_y;
  if (ptr_x_c.has_value()) {
    TORCH_CHECK(ptr_x_c.value().scalar_type() == at::kLong &&
                    ptr_y_c.value().scalar_type() == at::kLong,
                "'ptr_x' and 'ptr_y' must be of type long");
    ptr_x_c = ptr_x_c.value().contiguous();
    ptr_y_c = ptr_y_c.value().contiguous();
  }

  static auto op = c10::Dispatcher::singleton()
                       .findSchemaOrThrow("pyg::radius", "")
                       .typed<decltype(radius)>();
  return op.call(x_c, y_c, ptr_x_c, ptr_y_c, r, max_num_neighbors, num_workers,
                 ignore_same_index);
}
36+
37+
// Declare the `pyg::radius` operator schema; backend kernels are registered
// separately via TORCH_LIBRARY_IMPL in the CPU and CUDA translation units.
TORCH_LIBRARY_FRAGMENT(pyg, m) {
  m.def(TORCH_SELECTIVE_SCHEMA(
      "pyg::radius(Tensor x, Tensor y, Tensor? ptr_x=None, "
      "Tensor? ptr_y=None, float r=1.0, int max_num_neighbors=32, "
      "int num_workers=1, bool ignore_same_index=False) -> Tensor"));
}
43+
44+
} // namespace ops
45+
} // namespace pyg

pyg_lib/csrc/ops/radius.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#pragma once
2+
3+
#include <ATen/ATen.h>
4+
#include "pyg_lib/csrc/macros.h"
5+
6+
namespace pyg {
7+
namespace ops {
8+
9+
// Finds for each point in `y` all points in `x` within Euclidean distance
// `r`, returning a [2, num_edges] long tensor with y (query) indices in
// row 0 and x (reference) indices in row 1.
//
// Args:
//   x: Reference point set of shape [num_x, dim].
//   y: Query point set of shape [num_y, dim]; must share `dim` with `x`.
//   ptr_x: Optional CSR offsets partitioning `x` into batch examples.
//   ptr_y: Optional CSR offsets partitioning `y` into batch examples.
//   r: Search radius (must be positive).
//   max_num_neighbors: Maximum number of matches returned per query point.
//   num_workers: NOTE(review): appears unused by both visible backends —
//     confirm before relying on it.
//   ignore_same_index: If true, drops pairs whose x and y indices are equal
//     (self-loops when `x` and `y` are the same point set).
PYG_API at::Tensor radius(const at::Tensor& x,
                          const at::Tensor& y,
                          const std::optional<at::Tensor>& ptr_x,
                          const std::optional<at::Tensor>& ptr_y,
                          double r,
                          int64_t max_num_neighbors,
                          int64_t num_workers,
                          bool ignore_same_index);
17+
18+
} // namespace ops
19+
} // namespace pyg

test/ops/test_radius.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
import pytest
2+
import torch
3+
4+
import pyg_lib
5+
from pyg_lib.testing import withCUDA
6+
7+
8+
@withCUDA
@pytest.mark.parametrize('dtype', [torch.float, torch.double])
def test_radius_basic(dtype: torch.dtype, device: torch.device) -> None:
    """Single query: exactly the two in-radius points are returned."""
    x = torch.tensor([[0.0, 0.0], [1.0, 0.0], [2.0, 0.0], [10.0, 0.0]],
                     dtype=dtype, device=device)
    y = torch.tensor([[0.5, 0.0]], dtype=dtype, device=device)

    out = pyg_lib.ops.radius(x, y, r=1.5)
    # The output is always [2, num_edges], so the original
    # `out.shape[0] == 2` was trivially true; assert the edge count instead.
    assert out.shape == (2, 2)

    # x[0] and x[1] both lie at distance 0.5 from the query point.
    refs = out[1].sort()[0]
    assert refs.tolist() == [0, 1]
    assert (out[0] == 0).all()
22+
23+
24+
@withCUDA
@pytest.mark.parametrize('dtype', [torch.float, torch.double])
def test_radius_correctness(dtype: torch.dtype, device: torch.device) -> None:
    """Every returned edge connects a query/reference pair within radius."""
    x = torch.randn(30, 3, dtype=dtype, device=device)
    y = torch.randn(10, 3, dtype=dtype, device=device)
    r = 1.5

    out = pyg_lib.ops.radius(x, y, r=r, max_num_neighbors=100)

    # Gather the distance of every returned (query, reference) pair in one
    # vectorized lookup instead of iterating edge by edge.
    dists = torch.cdist(y.float(), x.float())
    edge_dists = dists[out[0], out[1]]
    assert bool((edge_dists <= r + 1e-5).all())
38+
39+
40+
@withCUDA
def test_radius_max_num_neighbors(device: torch.device) -> None:
    """No query point receives more than `max_num_neighbors` matches."""
    x = torch.randn(50, 3, device=device)
    y = torch.randn(10, 3, device=device)

    out = pyg_lib.ops.radius(x, y, r=100.0, max_num_neighbors=5)

    # Count matches per query in one shot rather than looping over queries.
    counts = torch.bincount(out[0], minlength=y.size(0))
    assert bool((counts <= 5).all())
49+
50+
51+
@withCUDA
def test_radius_batched(device: torch.device) -> None:
    """Edges never cross example boundaries in batched mode."""
    x = torch.randn(20, 3, device=device)
    y = torch.randn(15, 3, device=device)
    ptr_x = torch.tensor([0, 10, 20], dtype=torch.long, device=device)
    ptr_y = torch.tensor([0, 8, 15], dtype=torch.long, device=device)

    out = pyg_lib.ops.radius(x, y, r=5.0, ptr_x=ptr_x, ptr_y=ptr_y)

    # Map each endpoint to its batch example and require both endpoints of
    # every edge to agree (queries split at index 8, references at 10).
    query_batch = (out[0] >= 8).long()
    ref_batch = (out[1] >= 10).long()
    assert bool((query_batch == ref_batch).all())
67+
68+
69+
@withCUDA
def test_radius_ignore_same_index(device: torch.device) -> None:
    """With `ignore_same_index=True`, no self-loop edges are returned."""
    x = torch.randn(10, 3, device=device)

    out = pyg_lib.ops.radius(x, x, r=100.0, max_num_neighbors=100,
                             ignore_same_index=True)

    # Querying a set against itself must not pair any point with itself.
    self_loops = out[0] == out[1]
    assert not bool(self_loops.any())

0 commit comments

Comments
 (0)