Add edge_sample dispatch and CPU kernel (#594)

akihironitta · web-flow · commit 6fec0f00d202 · 2026-03-23T06:43:58.000-07:00
diff --git a/pyg_lib/csrc/ops/cpu/edge_sampler_kernel.cpp b/pyg_lib/csrc/ops/cpu/edge_sampler_kernel.cpp
@@ -0,0 +1,104 @@
+#include "../edge_sampler.h"
+
+#include <ATen/ATen.h>
+#include <torch/library.h>
+
+#include <cmath>
+#include <unordered_set>
+#include <vector>
+
+namespace pyg {
+namespace ops {
+
+namespace {
+
+// Samples distinct edge ids for every node listed in `start` (CPU kernel).
+//
+// The edges of node `v` are the ids in `[rowptr[v], rowptr[v + 1])`. Each
+// start node contributes `count` of them, or `ceil(factor * degree)` of them
+// when `count < 1`; the request is capped at the node's degree.
+//
+// Args:
+//   start: int64 node indices, each in `[0, rowptr.numel() - 1)`.
+//   rowptr: int64 CSR row pointer.
+//   count: number of edges to draw per node; `< 1` selects `factor` instead.
+//   factor: fraction of a node's edges to draw when `count < 1`.
+//
+// Returns a 1-D tensor with the options of `start` holding the sampled edge
+// ids, grouped by start node in the order the nodes appear in `start`.
+at::Tensor edge_sample_kernel(const at::Tensor& start,
+                              const at::Tensor& rowptr,
+                              int64_t count,
+                              double factor) {
+  // The raw pointers below read element `i` at offset `i`, which only holds
+  // for contiguous storage (e.g. not for a strided slice).
+  const auto start_c = start.contiguous();
+  const auto rowptr_c = rowptr.contiguous();
+  const auto* start_data = start_c.data_ptr<int64_t>();
+  const auto* rowptr_data = rowptr_c.data_ptr<int64_t>();
+  const int64_t num_nodes = rowptr_c.numel() - 1;
+
+  std::vector<int64_t> e_ids;
+
+  for (int64_t i = 0; i < start_c.numel(); i++) {
+    const int64_t node = start_data[i];
+    TORCH_CHECK(node >= 0 && node < num_nodes, "edge_sample: start node ", node,
+                " is out of range for a graph with ", num_nodes, " nodes");
+
+    const int64_t row_start = rowptr_data[node];
+    const int64_t num_neighbors = rowptr_data[node + 1] - row_start;
+
+    int64_t size = count;
+    if (count < 1)
+      size = static_cast<int64_t>(std::ceil(factor * double(num_neighbors)));
+    if (size > num_neighbors)
+      size = num_neighbors;
+    if (size <= 0)
+      continue;  // Nothing to draw for this node.
+
+    if (size < 0.7 * double(num_neighbors)) {
+      // Few edges requested: rejection-sample until `size` distinct offsets
+      // are found. Offsets come from ATen's generator instead of `std::rand`,
+      // which yields at most `RAND_MAX + 1` distinct values (so the loop
+      // could never finish once `size` exceeded that) and is not controlled
+      // by `torch.manual_seed`.
+      std::unordered_set<int64_t> seen;
+      int64_t missing = size;
+      while (missing > 0) {
+        const int64_t batch = missing;
+        const auto draws =
+            at::randint(num_neighbors, {batch}, start_c.options());
+        const auto* draws_data = draws.data_ptr<int64_t>();
+        for (int64_t j = 0; j < batch; j++) {
+          // `insert` reports whether the offset is new; repeats are redrawn.
+          if (seen.insert(draws_data[j]).second) {
+            e_ids.push_back(draws_data[j] + row_start);
+            missing--;
+          }
+        }
+      }
+    } else {
+      // Most edges requested: shuffle all offsets and keep the first `size`.
+      const auto perm = at::randperm(num_neighbors, start_c.options());
+      const auto* perm_data = perm.data_ptr<int64_t>();
+      for (int64_t j = 0; j < size; j++) {
+        e_ids.push_back(perm_data[j] + row_start);
+      }
+    }
+  }
+
+  // `from_blob` only borrows the vector's buffer, so clone before returning.
+  const auto length = static_cast<int64_t>(e_ids.size());
+  return at::from_blob(e_ids.data(), {length}, start.options()).clone();
+}
+
+}  // namespace
+
+// Registers the kernel as the CPU implementation of `pyg::edge_sample`.
+TORCH_LIBRARY_IMPL(pyg, CPU, m) {
+  m.impl(TORCH_SELECTIVE_NAME("pyg::edge_sample"),
+         TORCH_FN(edge_sample_kernel));
+}
+
+}  // namespace ops
+}  // namespace pyg
diff --git a/pyg_lib/csrc/ops/edge_sampler.cpp b/pyg_lib/csrc/ops/edge_sampler.cpp
@@ -0,0 +1,41 @@
+#include "edge_sampler.h"
+
+#include <ATen/core/dispatch/Dispatcher.h>
+#include <torch/library.h>
+
+namespace pyg {
+namespace ops {
+
+// Public entry point of `pyg::edge_sample`: validates the tensor arguments and
+// forwards the call to the kernel registered with the dispatcher.
+//
+// Both `start` and `rowptr` must be defined, one-dimensional tensors; `count`
+// and `factor` are passed through to the kernel untouched.
+PYG_API at::Tensor edge_sample(const at::Tensor& start,
+                               const at::Tensor& rowptr,
+                               int64_t count,
+                               double factor) {
+  const at::CheckedFrom caller = "edge_sample";
+  const at::TensorArg start_info(start, "start", 0);
+  const at::TensorArg rowptr_info(rowptr, "rowptr", 1);
+
+  at::checkAllDefined(caller, {start_info, rowptr_info});
+  at::checkDim(caller, start_info, 1);
+  at::checkDim(caller, rowptr_info, 1);
+
+  // Resolve the typed operator handle once; later calls reuse it.
+  static const auto handle = c10::Dispatcher::singleton()
+                                 .findSchemaOrThrow("pyg::edge_sample", "")
+                                 .typed<decltype(edge_sample)>();
+  return handle.call(start, rowptr, count, factor);
+}
+
+// Declares the operator schema, including the defaults for `count` and
+// `factor`; this fragment registers no kernel itself.
+TORCH_LIBRARY_FRAGMENT(pyg, m) {
+  m.def(
+      TORCH_SELECTIVE_SCHEMA("pyg::edge_sample(Tensor start, Tensor rowptr, "
+                             "int count=0, float factor=1.0) -> Tensor"));
+}
+
+}  // namespace ops
+}  // namespace pyg
diff --git a/pyg_lib/csrc/ops/edge_sampler.h b/pyg_lib/csrc/ops/edge_sampler.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include <ATen/ATen.h>
+#include "pyg_lib/csrc/macros.h"
+
+namespace pyg {
+namespace ops {
+
+// Samples edges incident to the nodes in `start` of a graph given in CSR form.
+//
+// `start` holds the node indices and `rowptr` the CSR row pointer (both 1-D).
+// Up to `count` edges are drawn per node; if `count < 1`, the per-node amount
+// is `ceil(factor * degree)` instead. Returns the sampled edge indices.
+PYG_API at::Tensor edge_sample(const at::Tensor& start,
+                               const at::Tensor& rowptr,
+                               int64_t count,
+                               double factor);
+
+}  // namespace ops
+}  // namespace pyg
diff --git a/pyg_lib/ops/__init__.py b/pyg_lib/ops/__init__.py
@@ -543,6 +543,30 @@ def graclus_cluster(
     return torch.ops.pyg.graclus_cluster(rowptr, col, weight)
 
 
+def edge_sample(
+    start: Tensor,
+    rowptr: Tensor,
+    count: int = 0,
+    factor: float = 1.0,
+) -> Tensor:
+    r"""Samples edges incident to the given start nodes.
+
+    For each start node, samples up to :obj:`count` edges. If
+    :obj:`count < 1`, samples :obj:`ceil(factor * degree)` edges instead.
+
+    Args:
+        start: Start node indices of shape :obj:`[S]`.
+        rowptr: CSR row pointer of shape :obj:`[N + 1]`.
+        count: Fixed number of edges to sample per node. If :obj:`< 1`,
+            uses :obj:`factor` instead.
+        factor: Fraction of edges to sample when :obj:`count < 1`.
+
+    Returns:
+        Sampled edge indices (into the edge list).
+    """
+    return torch.ops.pyg.edge_sample(start, rowptr, count, factor)
+
+
 __all__ = [
     'grouped_matmul',
     'segment_matmul',
@@ -560,4 +584,5 @@ def graclus_cluster(
     'radius',
     'nearest',
     'graclus_cluster',
+    'edge_sample',
 ]
diff --git a/test/ops/test_edge_sampler.py b/test/ops/test_edge_sampler.py
@@ -0,0 +1,52 @@
+import torch
+
+import pyg_lib
+from pyg_lib.testing import withCUDA
+
+
+@withCUDA
+def test_edge_sample_count(device: torch.device) -> None:
+    if device.type == 'cuda':
+        return  # CPU only
+    # Graph: node 0 has 3 edges, node 1 has 2 edges
+    rowptr = torch.tensor([0, 3, 5], dtype=torch.long, device=device)
+    start = torch.tensor([0, 1], dtype=torch.long, device=device)
+
+    out = pyg_lib.ops.edge_sample(start, rowptr, count=2)
+    assert out.numel() == 4  # 2 per node * 2 nodes
+
+    # All sampled edge indices should be valid
+    assert (out >= 0).all()
+    assert (out < 5).all()
+
+    # Node 0 edges in [0, 3), node 1 edges in [3, 5)
+    node0_edges = out[:2]
+    node1_edges = out[2:]
+    assert (node0_edges < 3).all()
+    assert (node1_edges >= 3).all()
+
+
+@withCUDA
+def test_edge_sample_factor(device: torch.device) -> None:
+    if device.type == 'cuda':
+        return  # CPU only
+    # Node with 10 edges
+    rowptr = torch.tensor([0, 10], dtype=torch.long, device=device)
+    start = torch.tensor([0], dtype=torch.long, device=device)
+
+    out = pyg_lib.ops.edge_sample(start, rowptr, count=0, factor=0.5)
+    # ceil(0.5 * 10) = 5
+    assert out.numel() == 5
+
+
+@withCUDA
+def test_edge_sample_cap(device: torch.device) -> None:
+    if device.type == 'cuda':
+        return  # CPU only
+    # Node with 3 edges, request 10
+    rowptr = torch.tensor([0, 3], dtype=torch.long, device=device)
+    start = torch.tensor([0], dtype=torch.long, device=device)
+
+    out = pyg_lib.ops.edge_sample(start, rowptr, count=10)
+    # Capped at degree = 3
+    assert out.numel() == 3