Skip to content

Commit 107c05f

Browse files
committed
Add nearest dispatch, CPU and CUDA kernels
New C++ CPU kernel (brute-force pairwise + argmin) replacing the original scipy fallback. CUDA kernel uses shared-memory argmin reduction with 1 block per query point. Tests included.
1 parent d1c8562 commit 107c05f

File tree

5 files changed

+299
-0
lines changed

5 files changed

+299
-0
lines changed
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
#include "../nearest.h"

#include <ATen/ATen.h>
#include <torch/library.h>

#include <limits>
#include <optional>

namespace pyg {
namespace ops {

namespace {

// CPU implementation of `pyg::nearest`: for every row of `x`, returns the
// index of the row in `y` with the smallest squared Euclidean distance
// (brute force over all candidates).
//
// When `ptr_x`/`ptr_y` are given, they are CSR-style batch offsets of length
// `num_batches + 1`: points in `x[ptr_x[b]:ptr_x[b+1]]` only search
// candidates in `y[ptr_y[b]:ptr_y[b+1]]`.
at::Tensor nearest_kernel(const at::Tensor& x,
                          const at::Tensor& y,
                          const std::optional<at::Tensor>& ptr_x,
                          const std::optional<at::Tensor>& ptr_y) {
  auto out = at::empty({x.size(0)}, x.options().dtype(at::kLong));

  AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "nearest_cpu", [&] {
    auto x_data = x.data_ptr<scalar_t>();
    auto y_data = y.data_ptr<scalar_t>();
    auto out_data = out.data_ptr<int64_t>();
    auto dim = x.size(1);

    // Scan `y` rows [y_start, y_end) for the nearest neighbor of `x` row `i`
    // and write its index into `out`. Strict `<` keeps the first (lowest)
    // index on exact distance ties.
    auto find_nearest = [&](int64_t i, int64_t y_start, int64_t y_end) {
      scalar_t best_dist = std::numeric_limits<scalar_t>::max();
      int64_t best_idx = y_start;
      for (int64_t j = y_start; j < y_end; j++) {
        scalar_t dist = 0;
        for (int64_t d = 0; d < dim; d++) {
          scalar_t diff = x_data[i * dim + d] - y_data[j * dim + d];
          dist += diff * diff;
        }
        if (dist < best_dist) {
          best_dist = dist;
          best_idx = j;
        }
      }
      out_data[i] = best_idx;
    };

    if (!ptr_x.has_value()) {
      // Single batch: every `x` row searches all of `y`.
      for (int64_t i = 0; i < x.size(0); i++) {
        find_nearest(i, 0, y.size(0));
      }
    } else {
      // Batched: both offset vectors are required; dereferencing a missing
      // `ptr_y` below would be undefined behavior.
      TORCH_CHECK(ptr_y.has_value(),
                  "'ptr_y' must be provided whenever 'ptr_x' is provided");
      auto ptr_x_data = ptr_x.value().data_ptr<int64_t>();
      auto ptr_y_data = ptr_y.value().data_ptr<int64_t>();
      auto num_batches = ptr_x.value().size(0) - 1;

      for (int64_t b = 0; b < num_batches; b++) {
        auto y_start = ptr_y_data[b], y_end = ptr_y_data[b + 1];
        for (int64_t i = ptr_x_data[b]; i < ptr_x_data[b + 1]; i++) {
          find_nearest(i, y_start, y_end);
        }
      }
    }
  });

  return out;
}

} // namespace

TORCH_LIBRARY_IMPL(pyg, CPU, m) {
  m.impl(TORCH_SELECTIVE_NAME("pyg::nearest"), TORCH_FN(nearest_kernel));
}

} // namespace ops
} // namespace pyg
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
#include "../nearest.h"
#include "utils.cuh"

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <torch/library.h>

#include <limits>
#include <optional>

namespace pyg {
namespace ops {

namespace {

// Launch configuration: one block per query point, NEAREST_THREADS threads
// cooperating on the argmin over that point's candidate set.
#define NEAREST_THREADS 1024

// One block handles query point `n_x = blockIdx.x`. Each thread scans a
// strided subset of the candidate rows of `y`, then the block combines the
// per-thread (distance, index) pairs with a shared-memory tree reduction.
// Preconditions: gridDim.x == x.size(0), blockDim.x == NEAREST_THREADS,
// `x`/`y` contiguous, `ptr_x`/`ptr_y` int64 offsets of length batch_size + 1.
template <typename scalar_t>
__global__ void nearest_cuda_kernel(const scalar_t* __restrict__ x,
                                    const scalar_t* __restrict__ y,
                                    const int64_t* __restrict__ ptr_x,
                                    const int64_t* __restrict__ ptr_y,
                                    int64_t* __restrict__ out,
                                    int64_t batch_size,
                                    int64_t dim) {
  const int64_t thread_idx = threadIdx.x;
  const int64_t n_x = blockIdx.x;

  // Find the batch this query point belongs to (linear scan; the number of
  // batches is expected to be small relative to the candidate scan below).
  int64_t batch_idx = 0;
  for (int64_t b = 0; b < batch_size; b++) {
    if (n_x >= ptr_x[b] && n_x < ptr_x[b + 1]) {
      batch_idx = b;
      break;
    }
  }

  const int64_t y_start = ptr_y[batch_idx];
  const int64_t y_end = ptr_y[batch_idx + 1];

  __shared__ scalar_t best_dist[NEAREST_THREADS];
  __shared__ int64_t best_dist_idx[NEAREST_THREADS];

  // Per-thread argmin over a strided slice of [y_start, y_end). The sentinel
  // is the type's maximum rather than a magic 1e38: for double inputs,
  // squared distances can legitimately exceed 1e38, which would have made
  // every candidate "not better" and returned a wrong index.
  scalar_t best = std::numeric_limits<scalar_t>::max();
  int64_t best_idx = y_start;
  for (int64_t n_y = y_start + thread_idx; n_y < y_end;
       n_y += NEAREST_THREADS) {
    scalar_t dist = 0;
    for (int64_t d = 0; d < dim; d++) {
      scalar_t diff = x[n_x * dim + d] - y[n_y * dim + d];
      dist += diff * diff;
    }

    if (scalar_lt(dist, best)) {
      best = dist;
      best_idx = n_y;
    }
  }

  best_dist[thread_idx] = best;
  best_dist_idx[thread_idx] = best_idx;

  // Pairwise tree reduction over shared memory; the barrier sits outside the
  // divergent branch so every thread reaches it. After the loop, slot 0
  // holds the block-wide argmin.
  for (int64_t u = 0; (1 << u) < NEAREST_THREADS; u++) {
    __syncthreads();
    if (thread_idx < (NEAREST_THREADS >> (u + 1))) {
      int64_t idx_1 = (thread_idx * 2) << u;
      int64_t idx_2 = (thread_idx * 2 + 1) << u;
      if (scalar_gt(best_dist[idx_1], best_dist[idx_2])) {
        best_dist[idx_1] = best_dist[idx_2];
        best_dist_idx[idx_1] = best_dist_idx[idx_2];
      }
    }
  }

  __syncthreads();
  if (thread_idx == 0) {
    out[n_x] = best_dist_idx[0];
  }
}

// Host-side wrapper: validates inputs, materializes trivial single-batch
// offsets when none are given, and launches one block per query point on the
// current CUDA stream.
at::Tensor nearest_cuda(const at::Tensor& x,
                        const at::Tensor& y,
                        const std::optional<at::Tensor>& ptr_x,
                        const std::optional<at::Tensor>& ptr_y) {
  TORCH_CHECK(x.is_cuda() && y.is_cuda(), "Inputs must be CUDA tensors");
  TORCH_CHECK(x.is_contiguous() && y.is_contiguous(),
              "Inputs must be contiguous");

  auto out = at::empty({x.size(0)}, x.options().dtype(at::kLong));

  // A grid of 0 blocks is an invalid launch configuration (and the arange
  // below would use a step of 0); an empty `x` trivially yields an empty
  // result.
  if (x.size(0) == 0)
    return out;

  std::optional<at::Tensor> ptr_x_v = ptr_x;
  std::optional<at::Tensor> ptr_y_v = ptr_y;

  // Without explicit offsets, treat all points as one batch: ptr = [0, N].
  if (!ptr_x_v.has_value())
    ptr_x_v =
        at::arange(0, x.size(0) + 1, x.size(0), x.options().dtype(at::kLong));
  if (!ptr_y_v.has_value())
    ptr_y_v =
        at::arange(0, y.size(0) + 1, y.size(0), y.options().dtype(at::kLong));

  auto stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "nearest_cuda", [&] {
    nearest_cuda_kernel<scalar_t><<<x.size(0), NEAREST_THREADS, 0, stream>>>(
        x.data_ptr<scalar_t>(), y.data_ptr<scalar_t>(),
        ptr_x_v.value().data_ptr<int64_t>(),
        ptr_y_v.value().data_ptr<int64_t>(), out.data_ptr<int64_t>(),
        ptr_x_v.value().size(0) - 1, x.size(1));
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  });

  return out;
}

} // namespace

TORCH_LIBRARY_IMPL(pyg, CUDA, m) {
  m.impl(TORCH_SELECTIVE_NAME("pyg::nearest"), TORCH_FN(nearest_cuda));
}

} // namespace ops
} // namespace pyg

pyg_lib/csrc/ops/nearest.cpp

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
#include "nearest.h"

#include <ATen/core/dispatch/Dispatcher.h>
#include <torch/library.h>

#include <optional>

namespace pyg {
namespace ops {

// Finds, for every point in `x`, the index of its nearest neighbor in `y`
// under squared Euclidean distance. Optional `ptr_x`/`ptr_y` are CSR-style
// batch offsets restricting the search to matching batches. Returns an
// int64 tensor of shape `[x.size(0)]`.
PYG_API at::Tensor nearest(const at::Tensor& x,
                           const at::Tensor& y,
                           const std::optional<at::Tensor>& ptr_x,
                           const std::optional<at::Tensor>& ptr_y) {
  at::TensorArg x_arg{x, "x", 0};
  at::TensorArg y_arg{y, "y", 1};
  at::CheckedFrom c{"nearest"};

  at::checkAllDefined(c, {x_arg, y_arg});

  // Flatten trailing dimensions so backend kernels only deal with 2-D input.
  auto x_c = x.view({x.size(0), -1}).contiguous();
  auto y_c = y.view({y.size(0), -1}).contiguous();

  TORCH_CHECK(x_c.size(1) == y_c.size(1),
              "x and y must have the same feature dim");

  // The backend kernels index `ptr_y` with batches derived from `ptr_x`, so
  // the two offset vectors must be provided together and agree on the number
  // of batches. Validate here, once, instead of in every backend.
  TORCH_CHECK(ptr_x.has_value() == ptr_y.has_value(),
              "'ptr_x' and 'ptr_y' must either both be provided or both be "
              "omitted");
  if (ptr_x.has_value()) {
    TORCH_CHECK(ptr_x.value().size(0) == ptr_y.value().size(0),
                "'ptr_x' and 'ptr_y' must hold the same number of batches");
  }

  static auto op = c10::Dispatcher::singleton()
                       .findSchemaOrThrow("pyg::nearest", "")
                       .typed<decltype(nearest)>();
  return op.call(x_c, y_c, ptr_x, ptr_y);
}

TORCH_LIBRARY_FRAGMENT(pyg, m) {
  m.def(TORCH_SELECTIVE_SCHEMA(
      "pyg::nearest(Tensor x, Tensor y, Tensor? ptr_x=None, "
      "Tensor? ptr_y=None) -> Tensor"));
}

} // namespace ops
} // namespace pyg

pyg_lib/csrc/ops/nearest.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#pragma once

#include <optional>

#include <ATen/ATen.h>
#include "pyg_lib/csrc/macros.h"

namespace pyg {
namespace ops {

// For every point in `x`, returns the index (into `y`) of its nearest
// neighbor under squared Euclidean distance. `ptr_x`/`ptr_y` are optional
// CSR-style batch offsets of equal length; when given, points are only
// matched against candidates within the same batch.
// Output: int64 tensor of shape `[x.size(0)]`.
PYG_API at::Tensor nearest(const at::Tensor& x,
                           const at::Tensor& y,
                           const std::optional<at::Tensor>& ptr_x,
                           const std::optional<at::Tensor>& ptr_y);

} // namespace ops
} // namespace pyg

test/ops/test_nearest.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import pytest
2+
import torch
3+
4+
import pyg_lib
5+
from pyg_lib.testing import withCUDA
6+
7+
8+
@withCUDA
@pytest.mark.parametrize('dtype', [torch.float, torch.double])
def test_nearest_basic(dtype: torch.dtype, device: torch.device) -> None:
    # Two query points on the x-axis, each with an unambiguous closest
    # candidate in `y`.
    x = torch.tensor([[0.0, 0.0], [3.0, 0.0]], dtype=dtype, device=device)
    y = torch.tensor([[1.0, 0.0], [2.0, 0.0]], dtype=dtype, device=device)

    out = pyg_lib.ops.nearest(x, y)
    assert out.shape == (2, )
    # (0, 0) is closest to y[0] = (1, 0), and (3, 0) to y[1] = (2, 0):
    assert out[0].item() == 0
    assert out[1].item() == 1
18+
19+
20+
@withCUDA
@pytest.mark.parametrize('dtype', [torch.float, torch.double])
def test_nearest_correctness(dtype: torch.dtype, device: torch.device) -> None:
    """Compare the kernel against a dense ``cdist`` + ``argmin`` reference.

    The reference is computed in the *input* dtype: downcasting double
    inputs to float32 could flip the argmin for near-tied distances and
    make the test flaky.
    """
    x = torch.randn(20, 5, dtype=dtype, device=device)
    y = torch.randn(15, 5, dtype=dtype, device=device)

    out = pyg_lib.ops.nearest(x, y)

    dists = torch.cdist(x, y)
    expected = dists.argmin(dim=1)
    assert torch.equal(out, expected)
32+
33+
34+
@withCUDA
def test_nearest_batched(device: torch.device) -> None:
    """Batched queries must match only within their own batch, and the
    matched indices must be the true per-batch nearest neighbors."""
    x = torch.randn(20, 3, device=device)
    y = torch.randn(15, 3, device=device)
    ptr_x = torch.tensor([0, 10, 20], dtype=torch.long, device=device)
    ptr_y = torch.tensor([0, 8, 15], dtype=torch.long, device=device)

    out = pyg_lib.ops.nearest(x, y, ptr_x=ptr_x, ptr_y=ptr_y)
    assert out.shape == (20, )

    # Each batch may only pick candidates from its own slice of `y`:
    assert (out[:10] >= 0).all() and (out[:10] < 8).all()
    assert (out[10:] >= 8).all() and (out[10:] < 15).all()

    # Per-batch brute-force reference (offsets shifted back to global
    # indices into `y`):
    for b in range(2):
        x_lo, x_hi = ptr_x[b].item(), ptr_x[b + 1].item()
        y_lo, y_hi = ptr_y[b].item(), ptr_y[b + 1].item()
        dists = torch.cdist(x[x_lo:x_hi], y[y_lo:y_hi])
        expected = dists.argmin(dim=1) + y_lo
        assert torch.equal(out[x_lo:x_hi], expected)

0 commit comments

Comments
 (0)