Skip to content

Commit 8b5daf0

Browse files
authored
Add graclus_cluster dispatch, CPU and CUDA kernels (#593)
1 parent c1c17b3 commit 8b5daf0

File tree

6 files changed

+509
-0
lines changed

6 files changed

+509
-0
lines changed
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
#include "../graclus.h"
2+
3+
#include <ATen/ATen.h>
4+
#include <torch/library.h>
5+
6+
namespace pyg {
7+
namespace ops {
8+
9+
namespace {
10+
11+
// Greedy graclus matching on CPU: visits nodes in a random order and matches
// each still-unmatched node with one unmatched neighbor — the first one when
// no weights are given, the heaviest-edge one otherwise.  Both endpoints of a
// matched pair store the smaller of the two node ids in `out`; a node with no
// unmatched neighbor stores its own id.
//
// Args:
//   rowptr: CSR row pointer of the graph, shape [num_nodes + 1] (int64).
//   col:    CSR column indices, shape [num_edges] (int64).
//   weight: optional edge weights aligned with `col`.
// Returns: int64 tensor of shape [num_nodes] holding each node's cluster id.
at::Tensor graclus_kernel(const at::Tensor& rowptr,
                          const at::Tensor& col,
                          const std::optional<at::Tensor>& weight) {
  // `data_ptr` below assumes densely packed memory; reject non-contiguous
  // inputs instead of silently reading wrong values.
  TORCH_CHECK(rowptr.is_contiguous(), "'rowptr' must be contiguous");
  TORCH_CHECK(col.is_contiguous(), "'col' must be contiguous");
  TORCH_CHECK(!weight.has_value() || weight.value().is_contiguous(),
              "'weight' must be contiguous");

  int64_t num_nodes = rowptr.numel() - 1;
  auto out = at::full({num_nodes}, -1, rowptr.options());
  // Random visiting order decorrelates the greedy matching from node ids.
  auto node_perm = at::randperm(num_nodes, rowptr.options());

  auto rowptr_data = rowptr.data_ptr<int64_t>();
  auto col_data = col.data_ptr<int64_t>();
  auto node_perm_data = node_perm.data_ptr<int64_t>();
  auto out_data = out.data_ptr<int64_t>();

  if (!weight.has_value()) {
    for (int64_t n = 0; n < num_nodes; n++) {
      auto u = node_perm_data[n];

      if (out_data[u] >= 0)  // Already matched in an earlier step.
        continue;

      // Default to a self-match; overwritten below if a partner is found.
      out_data[u] = u;

      int64_t row_start = rowptr_data[u], row_end = rowptr_data[u + 1];

      for (int64_t e = 0; e < row_end - row_start; e++) {
        auto v = col_data[row_start + e];

        if (out_data[v] >= 0)
          continue;

        // Match u with its first unmatched neighbor.
        out_data[u] = std::min(u, v);
        out_data[v] = std::min(u, v);
        break;
      }
    }
  } else {
    auto scalar_type = weight.value().scalar_type();
    AT_DISPATCH_ALL_TYPES_AND2(
        at::ScalarType::Half, at::ScalarType::BFloat16, scalar_type,
        "graclus_cpu", [&] {
          auto weight_data = weight.value().data_ptr<scalar_t>();

          for (int64_t n = 0; n < num_nodes; n++) {
            auto u = node_perm_data[n];

            if (out_data[u] >= 0)
              continue;

            auto v_max = u;  // Self-match if no unmatched neighbor exists.
            bool found = false;
            scalar_t w_max = (scalar_t)0.;

            for (int64_t e = rowptr_data[u]; e < rowptr_data[u + 1]; e++) {
              auto v = col_data[e];

              if (out_data[v] >= 0)
                continue;

              // Accept the first unmatched neighbor unconditionally so that
              // negative edge weights still yield a partner; afterwards keep
              // the maximum-weight candidate (ties favor later edges).
              if (!found || weight_data[e] >= w_max) {
                found = true;
                v_max = v;
                w_max = weight_data[e];
              }
            }

            out_data[u] = std::min(u, v_max);
            out_data[v_max] = std::min(u, v_max);
          }
        });
  }

  return out;
}
81+
82+
} // namespace
83+
84+
// Registers the CPU implementation for the `pyg::graclus_cluster` operator
// (schema declared via TORCH_LIBRARY_FRAGMENT elsewhere in this extension).
TORCH_LIBRARY_IMPL(pyg, CPU, m) {
  m.impl(TORCH_SELECTIVE_NAME("pyg::graclus_cluster"),
         TORCH_FN(graclus_kernel));
}
88+
89+
} // namespace ops
90+
} // namespace pyg
Lines changed: 260 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,260 @@
1+
#include "../graclus.h"
#include "utils.cuh"

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAGuard.h>
#include <torch/library.h>
7+
8+
namespace pyg {
9+
namespace ops {
10+
11+
namespace {
12+
13+
// Launch configuration: fixed block size with enough blocks to cover N.
#define GRACLUS_THREADS 256
#define GRACLUS_BLOCKS(N) ((N) + GRACLUS_THREADS - 1) / GRACLUS_THREADS
// Probability with which an unmatched node is colored "red" (proposer, -1)
// in each round; the complement becomes "blue" (responder, -2).
// NOTE(review): presumably taken from the randomized-matching literature —
// confirm the intended source of this constant.
#define BLUE_P 0.53406

// Device-global convergence flag: `colorize_kernel` clears it whenever at
// least one node is still unmatched, so the host knows to run another round.
__device__ bool done_d;
18+
19+
// Resets the device-side convergence flag before a coloring round.
// Launched with a single thread (<<<1, 1>>>).
__global__ void init_done_kernel() {
  done_d = true;
}
22+
23+
// Randomly colors every still-unmatched node and flags that another matching
// round is required.  A Bernoulli draw of 1 maps to -1 (red, proposes) and a
// draw of 0 maps to -2 (blue, responds); matched nodes (out >= 0) are left
// untouched.
__global__ void colorize_kernel(int64_t* out,
                                const float* bernoulli,
                                int64_t numel) {
  const int64_t node = blockIdx.x * blockDim.x + threadIdx.x;
  if (node >= numel)
    return;
  if (out[node] >= 0)  // Already matched; nothing to recolor.
    return;
  out[node] = static_cast<int64_t>(bernoulli[node]) - 2;
  done_d = false;  // Benign race: every writer stores the same value.
}
34+
35+
// Runs one coloring round: every unmatched node in `out` is randomly colored
// red (-1) or blue (-2).  Returns true when no unmatched node remained, i.e.
// the matching is complete.
bool colorize(at::Tensor out) {
  auto stream = at::cuda::getCurrentCUDAStream();
  init_done_kernel<<<1, 1, 0, stream>>>();
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  auto numel = out.size(0);
  if (numel > 0) {  // A zero-sized grid would be an invalid launch config.
    auto props = at::full({numel}, BLUE_P, out.options().dtype(at::kFloat));
    auto bernoulli = props.bernoulli();

    colorize_kernel<<<GRACLUS_BLOCKS(numel), GRACLUS_THREADS, 0, stream>>>(
        out.data_ptr<int64_t>(), bernoulli.data_ptr<float>(), numel);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  }

  bool done_h;
  // Copy the flag back on the same stream the kernels ran on: a synchronous
  // `cudaMemcpyFromSymbol` uses the legacy default stream, which does not
  // synchronize with PyTorch's non-blocking current stream and could read
  // `done_d` before `colorize_kernel` has finished.
  C10_CUDA_CHECK(cudaMemcpyFromSymbolAsync(&done_h, done_d, sizeof(done_h), 0,
                                           cudaMemcpyDeviceToHost, stream));
  C10_CUDA_CHECK(cudaStreamSynchronize(stream));
  return done_h;
}
51+
52+
// Red (-1) nodes scan their neighborhood and propose to the first blue (-2)
// neighbor they find.  A red node with no unmatched neighbor at all matches
// with itself.
__global__ void propose_kernel(int64_t* out,
                               int64_t* proposal,
                               const int64_t* rowptr,
                               const int64_t* col,
                               int64_t numel) {
  const int64_t u = blockIdx.x * blockDim.x + threadIdx.x;
  if (u >= numel || out[u] != -1)
    return;

  bool any_unmatched = false;
  const int64_t row_end = rowptr[u + 1];

  for (int64_t e = rowptr[u]; e < row_end; e++) {
    const int64_t v = col[e];

    if (out[v] < 0)
      any_unmatched = true;

    if (out[v] == -2) {  // First blue neighbor receives the proposal.
      proposal[u] = v;
      break;
    }
  }

  if (!any_unmatched)
    out[u] = u;  // No potential partner left: match with itself.
}
80+
81+
// Weighted variant of `propose_kernel`: a red (-1) node proposes to the blue
// (-2) neighbor connected by the largest edge weight (ties resolved in favor
// of later edges).  A red node with no unmatched neighbor matches with
// itself.
template <typename scalar_t>
__global__ void weighted_propose_kernel(int64_t* out,
                                        int64_t* proposal,
                                        const int64_t* rowptr,
                                        const int64_t* col,
                                        const scalar_t* weight,
                                        int64_t numel) {
  const int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < numel) {
    if (out[idx] != -1)
      return;

    bool has_unmatched_neighbor = false;
    bool found = false;  // Whether any blue neighbor has been seen yet.
    int64_t v_max = -1;
    scalar_t w_max = 0;

    for (int64_t i = rowptr[idx]; i < rowptr[idx + 1]; i++) {
      auto v = col[i];

      if (out[v] < 0)
        has_unmatched_neighbor = true;

      // Accept the first blue neighbor unconditionally so that graphs with
      // negative edge weights still make progress (otherwise no candidate
      // ever beats the initial w_max of 0 and the host loop never
      // terminates); afterwards keep the maximum-weight candidate.
      if (out[v] == -2 && (!found || scalar_ge(weight[i], w_max))) {
        found = true;
        v_max = v;
        w_max = weight[i];
      }
    }

    proposal[idx] = v_max;

    if (!has_unmatched_neighbor)
      out[idx] = idx;
  }
}
115+
116+
// Launches the (optionally weighted) proposal kernel over all nodes on the
// current CUDA stream.
void propose(at::Tensor out,
             at::Tensor proposal,
             at::Tensor rowptr,
             at::Tensor col,
             const std::optional<at::Tensor>& weight) {
  const auto numel = out.numel();
  if (numel == 0)  // A zero-sized grid is an invalid launch configuration.
    return;

  auto stream = at::cuda::getCurrentCUDAStream();

  if (!weight.has_value()) {
    propose_kernel<<<GRACLUS_BLOCKS(numel), GRACLUS_THREADS, 0, stream>>>(
        out.data_ptr<int64_t>(), proposal.data_ptr<int64_t>(),
        rowptr.data_ptr<int64_t>(), col.data_ptr<int64_t>(), numel);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  } else {
    auto w = weight.value();
    // NOTE(review): the CPU kernel dispatches on all types including Half
    // and BFloat16, while this path supports float/double only — confirm
    // whether that asymmetry is intended.
    AT_DISPATCH_FLOATING_TYPES(w.scalar_type(), "graclus_propose_cuda", [&] {
      weighted_propose_kernel<scalar_t>
          <<<GRACLUS_BLOCKS(numel), GRACLUS_THREADS, 0, stream>>>(
              out.data_ptr<int64_t>(), proposal.data_ptr<int64_t>(),
              rowptr.data_ptr<int64_t>(), col.data_ptr<int64_t>(),
              w.data_ptr<scalar_t>(), numel);
      C10_CUDA_KERNEL_LAUNCH_CHECK();
    });
  }
}
140+
141+
// Blue (-2) nodes accept the first red (-1) neighbor that proposed to them;
// both endpoints then store the smaller node id as their cluster.  A blue
// node with no unmatched neighbor matches with itself.
__global__ void respond_kernel(int64_t* out,
                               const int64_t* proposal,
                               const int64_t* rowptr,
                               const int64_t* col,
                               int64_t numel) {
  const int64_t u = blockIdx.x * blockDim.x + threadIdx.x;
  if (u >= numel || out[u] != -2)
    return;

  bool any_unmatched = false;
  const int64_t row_end = rowptr[u + 1];

  for (int64_t e = rowptr[u]; e < row_end; e++) {
    const int64_t v = col[e];

    if (out[v] < 0)
      any_unmatched = true;

    if (out[v] == -1 && proposal[v] == u) {  // v proposed to us: accept.
      const int64_t cluster = u < v ? u : v;
      out[u] = cluster;
      out[v] = cluster;
      break;
    }
  }

  if (!any_unmatched)
    out[u] = u;  // No potential partner left: match with itself.
}
171+
172+
// Weighted variant of `respond_kernel`: a blue (-2) node accepts, among the
// red (-1) neighbors that proposed to it, the one connected by the heaviest
// edge.  A blue node with no unmatched neighbor matches with itself.
template <typename scalar_t>
__global__ void weighted_respond_kernel(int64_t* out,
                                        const int64_t* proposal,
                                        const int64_t* rowptr,
                                        const int64_t* col,
                                        const scalar_t* weight,
                                        int64_t numel) {
  const int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < numel) {
    if (out[idx] != -2)
      return;

    bool has_unmatched_neighbor = false;
    bool found = false;  // Whether any proposing neighbor has been seen yet.
    int64_t v_max = -1;
    scalar_t w_max = 0;

    for (int64_t i = rowptr[idx]; i < rowptr[idx + 1]; i++) {
      auto v = col[i];

      if (out[v] < 0)
        has_unmatched_neighbor = true;

      // Accept the first proposer unconditionally so that negative edge
      // weights cannot starve the matching (otherwise no candidate ever
      // beats the initial w_max of 0); afterwards keep the maximum.
      if (out[v] == -1 && proposal[v] == idx &&
          (!found || scalar_ge(weight[i], w_max))) {
        found = true;
        v_max = v;
        w_max = weight[i];
      }
    }

    if (v_max >= 0) {
      int64_t m = idx < v_max ? idx : v_max;
      out[idx] = m;
      out[v_max] = m;
    }

    if (!has_unmatched_neighbor)
      out[idx] = idx;
  }
}
210+
211+
// Launches the (optionally weighted) response kernel over all nodes on the
// current CUDA stream.
void respond(at::Tensor out,
             at::Tensor proposal,
             at::Tensor rowptr,
             at::Tensor col,
             const std::optional<at::Tensor>& weight) {
  const auto numel = out.numel();
  if (numel == 0)  // A zero-sized grid is an invalid launch configuration.
    return;

  auto stream = at::cuda::getCurrentCUDAStream();

  if (!weight.has_value()) {
    respond_kernel<<<GRACLUS_BLOCKS(numel), GRACLUS_THREADS, 0, stream>>>(
        out.data_ptr<int64_t>(), proposal.data_ptr<int64_t>(),
        rowptr.data_ptr<int64_t>(), col.data_ptr<int64_t>(), numel);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  } else {
    auto w = weight.value();
    // NOTE(review): dtype coverage is narrower than the CPU path — see the
    // matching note in `propose`.
    AT_DISPATCH_FLOATING_TYPES(w.scalar_type(), "graclus_respond_cuda", [&] {
      weighted_respond_kernel<scalar_t>
          <<<GRACLUS_BLOCKS(numel), GRACLUS_THREADS, 0, stream>>>(
              out.data_ptr<int64_t>(), proposal.data_ptr<int64_t>(),
              rowptr.data_ptr<int64_t>(), col.data_ptr<int64_t>(),
              w.data_ptr<scalar_t>(), numel);
      C10_CUDA_KERNEL_LAUNCH_CHECK();
    });
  }
}
235+
236+
// CUDA entry point for `pyg::graclus_cluster`: repeatedly colors the
// remaining unmatched nodes and runs one propose/respond handshake per round
// until `colorize` reports that every node is matched.
//
// Args:
//   rowptr: CSR row pointer, shape [num_nodes + 1] (int64, CUDA).
//   col:    CSR column indices, shape [num_edges] (int64, CUDA).
//   weight: optional edge weights aligned with `col`.
// Returns: int64 CUDA tensor of shape [num_nodes] holding cluster ids.
at::Tensor graclus_cuda(const at::Tensor& rowptr,
                        const at::Tensor& col,
                        const std::optional<at::Tensor>& weight) {
  TORCH_CHECK(rowptr.is_cuda() && col.is_cuda(), "Inputs must be CUDA tensors");
  // Kernels read raw pointers, so densely packed memory is required.
  TORCH_CHECK(rowptr.is_contiguous(), "'rowptr' must be contiguous");
  TORCH_CHECK(col.is_contiguous(), "'col' must be contiguous");
  TORCH_CHECK(!weight.has_value() || weight.value().is_contiguous(),
              "'weight' must be contiguous");

  // Make sure kernels and allocations target the tensors' device rather
  // than whatever device happens to be current.
  const c10::cuda::CUDAGuard guard(rowptr.device());

  int64_t num_nodes = rowptr.numel() - 1;
  auto out = at::full({num_nodes}, -1, rowptr.options());
  auto proposal = at::full({num_nodes}, -1, rowptr.options());

  while (!colorize(out)) {
    propose(out, proposal, rowptr, col, weight);
    respond(out, proposal, rowptr, col, weight);
  }

  return out;
}
252+
253+
} // namespace
254+
255+
// Registers the CUDA implementation for the `pyg::graclus_cluster` operator
// (schema declared via TORCH_LIBRARY_FRAGMENT elsewhere in this extension).
TORCH_LIBRARY_IMPL(pyg, CUDA, m) {
  m.impl(TORCH_SELECTIVE_NAME("pyg::graclus_cluster"), TORCH_FN(graclus_cuda));
}
258+
259+
} // namespace ops
260+
} // namespace pyg

pyg_lib/csrc/ops/graclus.cpp

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
#include "graclus.h"
2+
3+
#include <ATen/core/dispatch/Dispatcher.h>
4+
#include <torch/library.h>
5+
6+
namespace pyg {
7+
namespace ops {
8+
9+
// Validates the arguments and dispatches `pyg::graclus_cluster` to the
// backend (CPU/CUDA) matching the input tensors.
//
// Args:
//   rowptr: CSR row pointer, 1-dimensional, shape [num_nodes + 1].
//   col:    CSR column indices, 1-dimensional, shape [num_edges].
//   weight: optional 1-dimensional edge weights aligned with `col`.
// Returns: tensor of shape [num_nodes] with the cluster id of every node.
PYG_API at::Tensor graclus_cluster(const at::Tensor& rowptr,
                                   const at::Tensor& col,
                                   const std::optional<at::Tensor>& weight) {
  at::TensorArg rowptr_arg{rowptr, "rowptr", 0};
  at::TensorArg col_arg{col, "col", 1};
  at::CheckedFrom c{"graclus_cluster"};

  at::checkAllDefined(c, {rowptr_arg, col_arg});
  at::checkDim(c, rowptr_arg, 1);
  at::checkDim(c, col_arg, 1);
  // The kernels compute `num_nodes = rowptr.numel() - 1`, so an empty
  // rowptr would silently produce a negative node count downstream.
  TORCH_CHECK(rowptr.numel() >= 1, "rowptr must hold at least one element");

  if (weight.has_value()) {
    TORCH_CHECK(weight.value().dim() == 1, "weight must be 1-dimensional");
    TORCH_CHECK(weight.value().numel() == col.numel(),
                "weight must have the same number of elements as col");
  }

  // Resolve the registered kernel once and reuse it on every call.
  static auto op = c10::Dispatcher::singleton()
                       .findSchemaOrThrow("pyg::graclus_cluster", "")
                       .typed<decltype(graclus_cluster)>();
  return op.call(rowptr, col, weight);
}
31+
32+
// Declares the operator schema; the CPU and CUDA kernels register their
// implementations against it via TORCH_LIBRARY_IMPL.
TORCH_LIBRARY_FRAGMENT(pyg, m) {
  m.def(
      TORCH_SELECTIVE_SCHEMA("pyg::graclus_cluster(Tensor rowptr, Tensor col, "
                             "Tensor? weight=None) -> Tensor"));
}
37+
38+
} // namespace ops
39+
} // namespace pyg

0 commit comments

Comments
 (0)