Add fps dispatch + CPU kernel (#587)

akihironitta · web-flow · commit b99c438de488 · 2026-03-22T18:01:18.000-07:00
diff --git a/pyg_lib/csrc/ops/cpu/fps_kernel.cpp b/pyg_lib/csrc/ops/cpu/fps_kernel.cpp
@@ -0,0 +1,65 @@
+#include "../fps.h"
+
+#include <ATen/ATen.h>
+#include <ATen/Parallel.h>
+#include <torch/library.h>
+
+namespace pyg {
+namespace ops {
+
+namespace {
+
+// CPU implementation of batched farthest point sampling (FPS).
+//
+// Args:
+//   src: Points of shape [N, D]. Rows `ptr[b]` to `ptr[b + 1]` (exclusive)
+//     form segment `b`.
+//   ptr: Non-decreasing int64 segment boundaries with B + 1 entries.
+//   ratio: Fraction of each segment to sample. Segment `b` contributes
+//     `ceil(ratio * size_b)` indices (evaluated in float32).
+//   random_start: If true, the first sample of every segment is drawn
+//     uniformly at random; otherwise it is the first row of the segment.
+//
+// Returns:
+//   A 1-D tensor (same options as `ptr`) of row indices into `src`, with the
+//   samples of all segments concatenated in segment order.
+at::Tensor fps_kernel(const at::Tensor& src,
+                      const at::Tensor& ptr,
+                      double ratio,
+                      bool random_start) {
+  const auto batch_size = ptr.numel() - 1;
+  TORCH_CHECK(batch_size >= 0, "ptr must contain at least one element");
+  // No segment at all: there is no last entry of `out_ptr` to read below.
+  if (batch_size == 0)
+    return at::empty({0}, ptr.options());
+
+  auto deg = ptr.narrow(0, 1, batch_size) - ptr.narrow(0, 0, batch_size);
+  TORCH_CHECK(deg.ge(0).all().item<bool>(), "ptr must be non-decreasing");
+
+  // Inclusive prefix sum of the per-segment sample counts.
+  auto out_ptr = deg.to(at::kFloat) * ratio;
+  out_ptr = out_ptr.ceil().to(at::kLong).cumsum(0);
+
+  auto out = at::empty({out_ptr[-1].item<int64_t>()}, ptr.options());
+
+  // Draw every random start up front on the calling thread. `rand()` is not
+  // safe to call from the worker threads below and ignores PyTorch's seed,
+  // whereas `at::rand` uses the default generator.
+  at::Tensor start_offset;
+  const int64_t* start_offset_data = nullptr;
+  if (random_start) {
+    start_offset = at::rand({batch_size}, ptr.options().dtype(at::kDouble))
+                       .mul(deg)
+                       .to(at::kLong);
+    // Guards against `u * deg` rounding up to `deg`, and keeps the offset of
+    // empty segments at zero.
+    start_offset = at::min(start_offset, (deg - 1).clamp_min(0)).contiguous();
+    start_offset_data = start_offset.data_ptr<int64_t>();
+  }
+
+  auto ptr_data = ptr.data_ptr<int64_t>();
+  auto out_ptr_data = out_ptr.data_ptr<int64_t>();
+  auto out_data = out.data_ptr<int64_t>();
+
+  // Segments are independent of each other, so they are processed in parallel.
+  int64_t grain_size = 1;
+  at::parallel_for(0, batch_size, grain_size, [&](int64_t begin, int64_t end) {
+    for (int64_t b = begin; b < end; b++) {
+      const auto src_start = ptr_data[b];
+      const auto src_end = ptr_data[b + 1];
+      const auto out_start = b == 0 ? 0 : out_ptr_data[b - 1];
+      const auto out_end = out_ptr_data[b];
+
+      // A segment without samples (e.g. an empty one) owns no output slot;
+      // writing a start index for it would corrupt a neighbouring segment or
+      // run past the end of `out`.
+      if (out_end == out_start)
+        continue;
+
+      const auto y = src.narrow(0, src_start, src_end - src_start);
+      const int64_t start_idx = random_start ? start_offset_data[b] : 0;
+
+      out_data[out_start] = src_start + start_idx;
+      // Squared distance of every point to the set of selected points.
+      auto dist = (y - y[start_idx]).pow_(2).sum(1);
+
+      for (int64_t i = 1; i < out_end - out_start; i++) {
+        const int64_t argmax = dist.argmax().data_ptr<int64_t>()[0];
+        out_data[out_start + i] = src_start + argmax;
+        dist = at::min(dist, (y - y[argmax]).pow_(2).sum(1));
+      }
+    }
+  });
+
+  return out;
+}
+
+}  // namespace
+
+// Registers `fps_kernel` as the CPU implementation of the `pyg::fps` operator.
+TORCH_LIBRARY_IMPL(pyg, CPU, m) {
+  m.impl(TORCH_SELECTIVE_NAME("pyg::fps"), TORCH_FN(fps_kernel));
+}
+
+}  // namespace ops
+}  // namespace pyg
diff --git a/pyg_lib/csrc/ops/fps.cpp b/pyg_lib/csrc/ops/fps.cpp
@@ -0,0 +1,38 @@
+#include "fps.h"
+
+#include <ATen/core/dispatch/Dispatcher.h>
+#include <torch/library.h>
+
+namespace pyg {
+namespace ops {
+
+// Public entry point for farthest point sampling.
+//
+// Validates the arguments, flattens `src` to a contiguous `[N, -1]` matrix and
+// forwards the call to the `pyg::fps` operator through the dispatcher.
+//
+// Args:
+//   src: Point positions with at least one dimension; dimension 0 indexes the
+//     points.
+//   ptr: 1-D tensor of segment boundaries.
+//   ratio: Fraction of points to sample, in (0, 1].
+//   random_start: Forwarded unchanged to the kernel.
+//
+// Returns:
+//   The tensor produced by the dispatched `pyg::fps` kernel.
+PYG_API at::Tensor fps(const at::Tensor& src,
+                       const at::Tensor& ptr,
+                       double ratio,
+                       bool random_start) {
+  at::TensorArg src_arg{src, "src", 0};
+  at::TensorArg ptr_arg{ptr, "ptr", 1};
+  at::CheckedFrom c{"fps"};
+
+  at::checkAllDefined(c, {src_arg, ptr_arg});
+  at::checkDim(c, ptr_arg, 1);
+
+  TORCH_CHECK(src.dim() >= 1, "src must have at least one dimension");
+  TORCH_CHECK(ratio > 0.0 && ratio <= 1.0, "ratio must be in the range (0, 1]");
+
+  // `reshape` instead of `view`: a non-contiguous `src` with more than two
+  // dimensions cannot always be viewed as `[N, -1]`, but it can be copied.
+  auto src_c = src.reshape({src.size(0), -1}).contiguous();
+  auto ptr_c = ptr.contiguous();
+
+  static auto op = c10::Dispatcher::singleton()
+                       .findSchemaOrThrow("pyg::fps", "")
+                       .typed<decltype(fps)>();
+  return op.call(src_c, ptr_c, ratio, random_start);
+}
+
+// Declares the schema of the `pyg::fps` operator. `ratio` defaults to 0.5 and
+// `random_start` to True.
+TORCH_LIBRARY_FRAGMENT(pyg, m) {
+  m.def(TORCH_SELECTIVE_SCHEMA(
+      "pyg::fps(Tensor src, Tensor ptr, float ratio=0.5, "
+      "bool random_start=True) -> Tensor"));
+}
+
+}  // namespace ops
+}  // namespace pyg
diff --git a/pyg_lib/csrc/ops/fps.h b/pyg_lib/csrc/ops/fps.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include <ATen/ATen.h>
+#include "pyg_lib/csrc/macros.h"
+
+namespace pyg {
+namespace ops {
+
+// Farthest point sampling over the segments of `src` delimited by `ptr`.
+//
+// `src` holds the points, `ptr` the segment boundaries, `ratio` the fraction
+// of points to sample, and `random_start` selects whether sampling starts
+// from a random point.
+PYG_API at::Tensor fps(const at::Tensor& src,
+                       const at::Tensor& ptr,
+                       double ratio,
+                       bool random_start);
+
+}  // namespace ops
+}  // namespace pyg
diff --git a/pyg_lib/ops/__init__.py b/pyg_lib/ops/__init__.py
@@ -418,6 +418,29 @@ def grid_cluster(
     return torch.ops.pyg.grid_cluster(pos, size, start, end)
 
 
+def fps(
+    src: Tensor,
+    ptr: Tensor,
+    ratio: float = 0.5,
+    random_start: bool = True,
+) -> Tensor:
+    r"""Performs greedy farthest point sampling.
+
+    Starting from a random point (or the first point), iteratively selects
+    the point that is farthest from the already selected set.
+
+    Args:
+        src: Point positions of shape :obj:`[N, D]`.
+        ptr: Batch boundaries as a CSR pointer of shape :obj:`[B + 1]`.
+        ratio: Fraction of points to sample from each batch (in :obj:`(0, 1]`).
+        random_start: If :obj:`True`, starts from a random point.
+
+    Returns:
+        Indices of the sampled points of shape :obj:`[M]`.
+    """
+    return torch.ops.pyg.fps(src, ptr, ratio, random_start)
+
+
 __all__ = [
     'grouped_matmul',
     'segment_matmul',
@@ -430,4 +453,5 @@ def grid_cluster(
     'spline_basis',
     'spline_weighting',
     'grid_cluster',
+    'fps',
 ]
diff --git a/test/ops/test_fps.py b/test/ops/test_fps.py
@@ -0,0 +1,100 @@
+import pytest
+import torch
+
+import pyg_lib
+
+
+@pytest.mark.parametrize('dtype', [torch.float, torch.double])
+def test_fps_output_size(dtype: torch.dtype) -> None:
+    N, D = 20, 3
+    src = torch.randn(N, D, dtype=dtype)
+    ptr = torch.tensor([0, N], dtype=torch.long)
+
+    out = pyg_lib.ops.fps(src, ptr, ratio=0.5, random_start=False)
+    assert out.shape == (10, )
+    assert out.dtype == torch.long
+    # All indices should be within range:
+    assert out.min() >= 0
+    assert out.max() < N
+
+
+@pytest.mark.parametrize('dtype', [torch.float, torch.double])
+def test_fps_farthest_property(dtype: torch.dtype) -> None:
+    # After FPS, the minimum pairwise distance between selected points
+    # should be >= the greedy guarantee.
+    src = torch.randn(50, 3, dtype=dtype)
+    ptr = torch.tensor([0, 50], dtype=torch.long)
+
+    out = pyg_lib.ops.fps(src, ptr, ratio=0.2, random_start=False)
+    selected = src[out]
+    dists = torch.cdist(selected, selected)
+    dists.fill_diagonal_(float('inf'))
+    min_dist = dists.min()
+    assert min_dist > 0
+
+
+def test_fps_multi_batch() -> None:
+    src = torch.randn(30, 3)
+    ptr = torch.tensor([0, 10, 30], dtype=torch.long)
+
+    out = pyg_lib.ops.fps(src, ptr, ratio=0.5, random_start=False)
+    # Batch 0: ceil(10 * 0.5) = 5, Batch 1: ceil(20 * 0.5) = 10
+    assert out.shape == (15, )
+    # First 5 indices in batch 0:
+    assert (out[:5] < 10).all()
+    assert (out[:5] >= 0).all()
+    # Next 10 in batch 1:
+    assert (out[5:] >= 10).all()
+    assert (out[5:] < 30).all()
+
+
+def test_fps_random_start() -> None:
+    src = torch.randn(20, 3)
+    ptr = torch.tensor([0, 20], dtype=torch.long)
+
+    out_det = pyg_lib.ops.fps(src, ptr, ratio=0.5, random_start=False)
+    # Deterministic: first selected index is always 0
+    assert out_det[0] == 0
+
+
+def test_fps_ratio_one() -> None:
+    # ratio=1.0 should return all points.
+    N = 15
+    src = torch.randn(N, 3)
+    ptr = torch.tensor([0, N], dtype=torch.long)
+
+    out = pyg_lib.ops.fps(src, ptr, ratio=1.0, random_start=False)
+    assert out.shape == (N, )
+    assert set(out.tolist()) == set(range(N))
+
+
+def test_fps_single_point_batch() -> None:
+    # Edge case: batch with a single point.
+    src = torch.randn(1, 3)
+    ptr = torch.tensor([0, 1], dtype=torch.long)
+
+    out = pyg_lib.ops.fps(src, ptr, ratio=1.0, random_start=False)
+    assert out.shape == (1, )
+    assert out[0] == 0
+
+
+@pytest.mark.parametrize('dtype', [torch.float, torch.double])
+def test_fps_greedy_property(dtype: torch.dtype) -> None:
+    # Verify the greedy FPS invariant: each selected point (after the first)
+    # must be the farthest from the already-selected set at the time of its
+    # selection.
+    src = torch.randn(30, 3, dtype=dtype)
+    ptr = torch.tensor([0, 30], dtype=torch.long)
+
+    out = pyg_lib.ops.fps(src, ptr, ratio=0.5, random_start=False)
+
+    selected = [out[0].item()]
+    for i in range(1, out.shape[0]):
+        # Minimum distance from each candidate to the selected set so far:
+        sel = src[selected]
+        dists = torch.cdist(src.unsqueeze(0), sel.unsqueeze(0)).squeeze(0)
+        min_dists = dists.min(dim=1).values
+        # The point FPS picked should have the maximum min-distance:
+        expected = min_dists.argmax().item()
+        assert out[i].item() == expected
+        selected.append(out[i].item())