Commit 6281557

moved extensions to torch.ops

1 parent 0a221ab · commit 6281557
22 files changed: +255 −216 lines
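
Every binding file below makes the same move: the pybind11 extension-module entry point is replaced by TorchScript operator registration, so the functions become reachable from Python via torch.ops (e.g. torch.ops.torch_scatter_cpu.gather_csr) instead of as module attributes. A minimal sketch of the pattern with a toy op; the name my_ops::add_one is illustrative, not part of this commit:

#include <torch/script.h>

// Before: exported as a Python extension module.
//   PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("add_one", &add_one); }

torch::Tensor add_one(torch::Tensor x) { return x + 1; }

// After: registered with the dispatcher when the shared library loads;
// callable from Python as torch.ops.my_ops.add_one(x).
static auto registry = torch::RegisterOperators("my_ops::add_one", &add_one);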

cpu/dim_apply.h

Lines changed: 2 additions & 2 deletions

@@ -19,7 +19,7 @@
   auto TENSOR3##_stride = TENSOR3.stride(DIM); \
 \
   auto dims = TENSOR1.dim(); \
-  auto zeros = at::zeros(dims, TENSOR1.options().dtype(at::kLong)); \
+  auto zeros = torch::zeros(dims, TENSOR1.options().dtype(torch::kLong)); \
   auto counter = zeros.DATA_PTR<int64_t>(); \
   bool has_finished = false; \
 \
@@ -78,7 +78,7 @@
   auto TENSOR4##_stride = TENSOR4.stride(DIM); \
 \
   auto dims = TENSOR1.dim(); \
-  auto zeros = at::zeros(dims, TENSOR1.options().dtype(at::kLong)); \
+  auto zeros = torch::zeros(dims, TENSOR1.options().dtype(torch::kLong)); \
   auto counter = zeros.DATA_PTR<int64_t>(); \
   bool has_finished = false; \
 \
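
The only change here is at::zeros/at::kLong becoming torch::zeros/torch::kLong, i.e. the macro now spells the factory call through the public C++ frontend namespace; the created tensor is identical. A small self-contained sketch of the call, assuming nothing beyond libtorch:

#include <torch/torch.h>

int main() {
  auto t = torch::randn({3, 4});
  // options() carries t's device and layout; dtype() overrides the scalar
  // type, so this yields an int64 zero tensor with t.dim() elements.
  auto zeros = torch::zeros(t.dim(), t.options().dtype(torch::kLong));
}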

cpu/gather.cpp

Lines changed: 13 additions & 14 deletions

@@ -1,14 +1,14 @@
-#include <torch/extension.h>
+#include <torch/script.h>
 
 #include "compat.h"
 #include "index_info.h"
 
 #include <vector>
 
-#define CHECK_CPU(x) AT_ASSERTM(!x.type().is_cuda(), #x " must be CPU tensor")
+#define CHECK_CPU(x) AT_ASSERTM(x.device().is_cpu(), #x " must be CPU tensor")
 
-at::Tensor gather_csr(at::Tensor src, at::Tensor indptr,
-                      at::optional<at::Tensor> out_opt) {
+torch::Tensor gather_csr(torch::Tensor src, torch::Tensor indptr,
+                         torch::optional<torch::Tensor> out_opt) {
   CHECK_CPU(src);
   CHECK_CPU(indptr);
   if (out_opt.has_value())
@@ -23,7 +23,7 @@ at::Tensor gather_csr(at::Tensor src, at::Tensor indptr,
   AT_ASSERTM(src.size(gather_dim) == indptr.size(gather_dim) - 1,
              "Input mismatch");
 
-  at::Tensor out;
+  torch::Tensor out;
   if (out_opt.has_value()) {
     out = out_opt.value().contiguous();
     for (int i = 0; i < out.dim(); i++)
@@ -32,7 +32,7 @@ at::Tensor gather_csr(at::Tensor src, at::Tensor indptr,
   } else {
     auto sizes = src.sizes().vec();
     sizes[gather_dim] = *indptr.flatten()[-1].DATA_PTR<int64_t>();
-    out = at::empty(sizes, src.options());
+    out = torch::empty(sizes, src.options());
   }
 
   auto N = src.size(gather_dim) * (indptr.numel() / indptr.size(-1));
@@ -68,8 +68,8 @@ at::Tensor gather_csr(at::Tensor src, at::Tensor indptr,
   return out;
 }
 
-at::Tensor gather_coo(at::Tensor src, at::Tensor index,
-                      at::optional<at::Tensor> out_opt) {
+torch::Tensor gather_coo(torch::Tensor src, torch::Tensor index,
+                         torch::optional<torch::Tensor> out_opt) {
   CHECK_CPU(src);
   CHECK_CPU(index);
   if (out_opt.has_value())
@@ -82,7 +82,7 @@ at::Tensor gather_coo(at::Tensor src, at::Tensor index,
   src = src.contiguous();
   auto gather_dim = index.dim() - 1;
 
-  at::Tensor out;
+  torch::Tensor out;
   if (out_opt.has_value()) {
     out = out_opt.value().contiguous();
     for (int i = 0; i < index.dim(); i++)
@@ -92,7 +92,7 @@ at::Tensor gather_coo(at::Tensor src, at::Tensor index,
   } else {
     auto sizes = src.sizes().vec();
     sizes[gather_dim] = index.size(gather_dim);
-    out = at::empty(sizes, src.options());
+    out = torch::empty(sizes, src.options());
   }
 
   auto E_1 = index.numel() / out.size(gather_dim);
@@ -139,7 +139,6 @@ at::Tensor gather_coo(at::Tensor src, at::Tensor index,
   return out;
 }
 
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def("gather_csr", &gather_csr, "Gather CSR (CPU)");
-  m.def("gather_coo", &gather_coo, "Gather COO (CPU)");
-}
+static auto registry =
+    torch::RegisterOperators("torch_scatter_cpu::gather_csr", &gather_csr)
+        .op("torch_scatter_cpu::gather_coo", &gather_coo);
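
The sizing logic above (output length along gather_dim equals the last indptr entry, and src must have indptr.size(gather_dim) - 1 entries along that dimension) implies the assumed CSR-gather semantics: each src slice i is repeated indptr[i+1] - indptr[i] times. A worked example under that assumption:

// indptr = {0, 3, 5}: slice 0 fills output positions [0, 3),
//                     slice 1 fills output positions [3, 5).
// src = {10, 20}  ->  gather_csr(src, indptr) = {10, 10, 10, 20, 20},
// whose length 5 is exactly indptr[-1], matching sizes[gather_dim] above.

After this commit the op is called from Python as torch.ops.torch_scatter_cpu.gather_csr rather than as an attribute of a pybind11 module.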

cpu/index_info.h

Lines changed: 1 addition & 1 deletion

@@ -26,7 +26,7 @@ template <typename scalar_t> struct TensorInfo {
 };
 
 template <typename scalar_t>
-TensorInfo<scalar_t> getTensorInfo(const at::Tensor &tensor) {
+TensorInfo<scalar_t> getTensorInfo(const torch::Tensor &tensor) {
   int sizes[MAX_TENSORINFO_DIMS];
   int strides[MAX_TENSORINFO_DIMS];
 
cpu/scatter.cpp

Lines changed: 13 additions & 14 deletions

@@ -1,10 +1,10 @@
-#include <torch/extension.h>
+#include <torch/script.h>
 
 #include "dim_apply.h"
 
-#define CHECK_CPU(x) AT_ASSERTM(!x.type().is_cuda(), #x " must be CPU tensor")
+#define CHECK_CPU(x) AT_ASSERTM(x.device().is_cpu(), #x " must be CPU tensor")
 
-void scatter_mul(at::Tensor src, at::Tensor index, at::Tensor out,
+void scatter_mul(torch::Tensor src, torch::Tensor index, torch::Tensor out,
                  int64_t dim) {
   CHECK_CPU(src);
   CHECK_CPU(index);
@@ -20,7 +20,7 @@ void scatter_mul(at::Tensor src, at::Tensor index, at::Tensor out,
   });
 }
 
-void scatter_div(at::Tensor src, at::Tensor index, at::Tensor out,
+void scatter_div(torch::Tensor src, torch::Tensor index, torch::Tensor out,
                  int64_t dim) {
   CHECK_CPU(src);
   CHECK_CPU(index);
@@ -36,8 +36,8 @@ void scatter_div(at::Tensor src, at::Tensor index, at::Tensor out,
   });
 }
 
-void scatter_max(at::Tensor src, at::Tensor index, at::Tensor out,
-                 at::Tensor arg, int64_t dim) {
+void scatter_max(torch::Tensor src, torch::Tensor index, torch::Tensor out,
+                 torch::Tensor arg, int64_t dim) {
   CHECK_CPU(src);
   CHECK_CPU(index);
   CHECK_CPU(out);
@@ -56,8 +56,8 @@ void scatter_max(at::Tensor src, at::Tensor index, at::Tensor out,
   });
 }
 
-void scatter_min(at::Tensor src, at::Tensor index, at::Tensor out,
-                 at::Tensor arg, int64_t dim) {
+void scatter_min(torch::Tensor src, torch::Tensor index, torch::Tensor out,
+                 torch::Tensor arg, int64_t dim) {
   CHECK_CPU(src);
   CHECK_CPU(index);
   CHECK_CPU(out);
@@ -77,9 +77,8 @@ void scatter_min(at::Tensor src, at::Tensor index, at::Tensor out,
   });
 }
 
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def("scatter_mul", &scatter_mul, "Scatter Mul (CPU)");
-  m.def("scatter_div", &scatter_div, "Scatter Div (CPU)");
-  m.def("scatter_max", &scatter_max, "Scatter Max (CPU)");
-  m.def("scatter_min", &scatter_min, "Scatter Min (CPU)");
-}
+static auto registry =
+    torch::RegisterOperators("torch_scatter_cpu::scatter_mul", &scatter_mul)
+        .op("torch_scatter_cpu::scatter_div", &scatter_div)
+        .op("torch_scatter_cpu::scatter_max", &scatter_max)
+        .op("torch_scatter_cpu::scatter_min", &scatter_min);
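
All four functions return void: out, and arg for the max/min variants, are preallocated by the caller and updated in place. A comment sketch of the assumed scatter_max contract along dim = 0:

// src   = {1., 5., 3.}
// index = {0,  0,  1 }
// scatter_max folds src into out at the positions named by index:
//   out[0] = max(out[0], 1., 5.)   out[1] = max(out[1], 3.)
// arg records which src position supplied each maximum, presumably so
// gradients can be routed back to the winning elements.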

cpu/segment.cpp

Lines changed: 16 additions & 17 deletions

@@ -1,11 +1,11 @@
-#include <torch/extension.h>
+#include <torch/script.h>
 
 #include "compat.h"
 #include "index_info.h"
 
 #include <vector>
 
-#define CHECK_CPU(x) AT_ASSERTM(!x.type().is_cuda(), #x " must be CPU tensor")
+#define CHECK_CPU(x) AT_ASSERTM(x.device().is_cpu(), #x " must be CPU tensor")
 
 enum ReductionType { SUM, MEAN, MIN, MAX };
 
@@ -74,9 +74,9 @@ template <typename scalar_t, ReductionType REDUCE> struct Reducer {
   }
 };
 
-std::tuple<at::Tensor, at::optional<at::Tensor>>
-segment_csr(at::Tensor src, at::Tensor indptr, at::optional<at::Tensor> out_opt,
-            std::string reduce) {
+std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
+segment_csr(torch::Tensor src, torch::Tensor indptr,
+            torch::optional<torch::Tensor> out_opt, std::string reduce) {
   CHECK_CPU(src);
   CHECK_CPU(indptr);
   if (out_opt.has_value())
@@ -94,7 +94,7 @@ segment_csr(at::Tensor src, at::Tensor indptr, at::optional<at::Tensor> out_opt,
   src = src.contiguous();
   auto reduce_dim = indptr.dim() - 1;
 
-  at::Tensor out;
+  torch::Tensor out;
   if (out_opt.has_value()) {
     out = out_opt.value().contiguous();
     for (int i = 0; i < out.dim(); i++)
@@ -105,13 +105,13 @@ segment_csr(at::Tensor src, at::Tensor indptr, at::optional<at::Tensor> out_opt,
   } else {
     sizes = src.sizes().vec();
     sizes[reduce_dim] = indptr.size(reduce_dim) - 1;
-    out = at::empty(sizes, src.options());
+    out = torch::empty(sizes, src.options());
   }
 
-  at::optional<at::Tensor> arg_out = at::nullopt;
+  torch::optional<torch::Tensor> arg_out = torch::nullopt;
   int64_t *arg_out_data = nullptr;
   if (reduce2REDUCE.at(reduce) == MIN || reduce2REDUCE.at(reduce) == MAX) {
-    arg_out = at::full_like(out, src.size(reduce_dim), indptr.options());
+    arg_out = torch::full_like(out, src.size(reduce_dim), indptr.options());
     arg_out_data = arg_out.value().DATA_PTR<int64_t>();
   }
 
@@ -156,8 +156,8 @@ segment_csr(at::Tensor src, at::Tensor indptr, at::optional<at::Tensor> out_opt,
   return std::make_tuple(out, arg_out);
 }
 
-std::tuple<at::Tensor, at::optional<at::Tensor>>
-segment_coo(at::Tensor src, at::Tensor index, at::Tensor out,
+std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
+segment_coo(torch::Tensor src, torch::Tensor index, torch::Tensor out,
             std::string reduce) {
   CHECK_CPU(src);
   CHECK_CPU(index);
@@ -180,10 +180,10 @@ segment_coo(at::Tensor src, at::Tensor index, at::Tensor out,
     if (i != reduce_dim)
       AT_ASSERTM(src.size(i) == out.size(i), "Input mismatch");
 
-  at::optional<at::Tensor> arg_out = at::nullopt;
+  torch::optional<torch::Tensor> arg_out = torch::nullopt;
   int64_t *arg_out_data = nullptr;
   if (reduce2REDUCE.at(reduce) == MIN || reduce2REDUCE.at(reduce) == MAX) {
-    arg_out = at::full_like(out, src.size(reduce_dim), index.options());
+    arg_out = torch::full_like(out, src.size(reduce_dim), index.options());
     arg_out_data = arg_out.value().DATA_PTR<int64_t>();
   }
 
@@ -251,7 +251,6 @@ segment_coo(at::Tensor src, at::Tensor index, at::Tensor out,
   return std::make_tuple(out, arg_out);
 }
 
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def("segment_csr", &segment_csr, "Segment CSR (CPU)");
-  m.def("segment_coo", &segment_coo, "Segment COO (CPU)");
-}
+static auto registry =
+    torch::RegisterOperators("torch_scatter_cpu::segment_csr", &segment_csr)
+        .op("torch_scatter_cpu::segment_coo", &segment_coo);
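
segment_csr and segment_coo now return std::tuple<torch::Tensor, torch::optional<torch::Tensor>>: the second slot carries arg indices for the "min"/"max" reductions and stays torch::nullopt otherwise, initialized to src.size(reduce_dim) as a one-past-the-end sentinel. A minimal sketch of that return convention, detached from the reduction machinery above:

#include <torch/torch.h>

std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
reduce_stub(torch::Tensor out, bool wants_arg, int64_t sentinel) {
  torch::optional<torch::Tensor> arg_out = torch::nullopt;
  if (wants_arg)
    // Mirrors the full_like(out, src.size(reduce_dim), ...) call above:
    // every slot starts at the sentinel until a real argmin/argmax lands.
    arg_out = torch::full_like(out, sentinel, torch::kLong);
  return std::make_tuple(out, arg_out);
}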

cuda/gather.cpp

Lines changed: 14 additions & 14 deletions

@@ -1,31 +1,31 @@
-#include <torch/extension.h>
+#include <torch/script.h>
 
-#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be CUDA tensor")
+#define CHECK_CUDA(x) \
+  AT_ASSERTM(x.device().is_cuda(), #x " must be CUDA tensor")
 
-at::Tensor gather_csr_cuda(at::Tensor src, at::Tensor indptr,
-                           at::optional<at::Tensor> out_opt);
-at::Tensor gather_coo_cuda(at::Tensor src, at::Tensor index,
-                           at::optional<at::Tensor> out_opt);
+torch::Tensor gather_csr_cuda(torch::Tensor src, torch::Tensor indptr,
+                              torch::optional<torch::Tensor> out_opt);
+torch::Tensor gather_coo_cuda(torch::Tensor src, torch::Tensor index,
+                              torch::optional<torch::Tensor> out_opt);
 
-at::Tensor gather_csr(at::Tensor src, at::Tensor indptr,
-                      at::optional<at::Tensor> out_opt) {
+torch::Tensor gather_csr(torch::Tensor src, torch::Tensor indptr,
+                         torch::optional<torch::Tensor> out_opt) {
   CHECK_CUDA(src);
   CHECK_CUDA(indptr);
   if (out_opt.has_value())
     CHECK_CUDA(out_opt.value());
   return gather_csr_cuda(src, indptr, out_opt);
 }
 
-at::Tensor gather_coo(at::Tensor src, at::Tensor index,
-                      at::optional<at::Tensor> out_opt) {
+torch::Tensor gather_coo(torch::Tensor src, torch::Tensor index,
+                         torch::optional<torch::Tensor> out_opt) {
   CHECK_CUDA(src);
   CHECK_CUDA(index);
   if (out_opt.has_value())
     CHECK_CUDA(out_opt.value());
   return gather_coo_cuda(src, index, out_opt);
 }
 
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def("gather_csr", &gather_csr, "Gather CSR (CUDA)");
-  m.def("gather_coo", &gather_coo, "Gather COO (CUDA)");
-}
+static auto registry =
+    torch::RegisterOperators("torch_scatter_cuda::gather_csr", &gather_csr)
+        .op("torch_scatter_cuda::gather_coo", &gather_coo);
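
This translation unit is compiled by the host compiler and only checks devices before forwarding; the *_cuda definitions live in gather_kernel.cu, which nvcc builds. A minimal sketch of that split with a hypothetical op:

// my_op.cpp (host compiler): declare the CUDA entry point, check, forward.
torch::Tensor my_op_cuda(torch::Tensor x); // defined in my_op_kernel.cu

torch::Tensor my_op(torch::Tensor x) {
  AT_ASSERTM(x.device().is_cuda(), "x must be CUDA tensor");
  return my_op_cuda(x); // the .cu side launches the actual kernels
}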

cuda/gather_kernel.cu

Lines changed: 11 additions & 8 deletions

@@ -1,7 +1,7 @@
-#include <ATen/ATen.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <ATen/cuda/detail/IndexUtils.cuh>
 #include <ATen/cuda/detail/TensorInfo.cuh>
+#include <torch/extension.h>
 
 #include "compat.cuh"
 #include "indptr.cuh"
@@ -58,9 +58,10 @@ __global__ void gather_csr_broadcast_kernel(
   }
 }
 
-at::Tensor gather_csr_cuda(at::Tensor src, at::Tensor indptr,
-                           at::optional<at::Tensor> out_opt) {
+torch::Tensor gather_csr_cuda(torch::Tensor src, torch::Tensor indptr,
+                              torch::optional<torch::Tensor> out_opt) {
 
+  cudaSetDevice(src.get_device());
   AT_ASSERTM(src.dim() >= indptr.dim(), "Input mismatch");
   for (int i = 0; i < indptr.dim() - 1; i++)
     AT_ASSERTM(src.size(i) == indptr.size(i), "Input mismatch");
@@ -70,7 +71,7 @@ at::Tensor gather_csr_cuda(at::Tensor src, at::Tensor indptr,
   AT_ASSERTM(src.size(gather_dim) == indptr.size(gather_dim) - 1,
              "Input mismatch");
 
-  at::Tensor out;
+  torch::Tensor out;
   if (out_opt.has_value()) {
     out = out_opt.value().contiguous();
     for (int i = 0; i < out.dim(); i++)
@@ -152,8 +153,10 @@ __global__ void gather_coo_broadcast_kernel(
   }
 }
 
-at::Tensor gather_coo_cuda(at::Tensor src, at::Tensor index,
-                           at::optional<at::Tensor> out_opt) {
+torch::Tensor gather_coo_cuda(torch::Tensor src, torch::Tensor index,
+                              torch::optional<torch::Tensor> out_opt) {
+
+  cudaSetDevice(src.get_device());
 
   AT_ASSERTM(src.dim() >= index.dim(), "Input mismatch");
   for (int i = 0; i < index.dim() - 1; i++)
@@ -162,7 +165,7 @@ at::Tensor gather_coo_cuda(at::Tensor src, at::Tensor index,
   src = src.contiguous();
   auto gather_dim = index.dim() - 1;
 
-  at::Tensor out;
+  torch::Tensor out;
   if (out_opt.has_value()) {
     out = out_opt.value().contiguous();
     for (int i = 0; i < index.dim(); i++)
@@ -172,7 +175,7 @@ at::Tensor gather_coo_cuda(at::Tensor src, at::Tensor index,
   } else {
     auto sizes = src.sizes().vec();
     sizes[gather_dim] = index.size(gather_dim);
-    out = at::empty(sizes, src.options());
+    out = torch::empty(sizes, src.options());
   }
 
   auto E = index.numel();
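
The added cudaSetDevice(src.get_device()) calls make each entry point launch its kernels on the device that holds src; without them, launches target whatever device happens to be current, which breaks on multi-GPU machines. A sketch of the idiom:

#include <cuda_runtime.h>
#include <torch/extension.h>

void launch_on_input_device(const torch::Tensor &src) {
  // get_device() returns src's CUDA device index; cudaSetDevice makes it
  // the current device for all subsequent launches on this thread.
  cudaSetDevice(src.get_device());
  // ... allocate outputs and launch kernels here ...
}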

cuda/index.cuh

Lines changed: 1 addition & 1 deletion

@@ -1,7 +1,7 @@
 #pragma once
 
-#include <ATen/ATen.h>
 #include <ATen/cuda/detail/TensorInfo.cuh>
+#include <torch/extension.h>
 
 template <typename scalar1, typename scalar2, int64_t Dims>
 struct IndexToScatterOffsets3 {

cuda/indptr.cuh

Lines changed: 1 addition & 1 deletion

@@ -1,7 +1,7 @@
 #pragma once
 
-#include <ATen/ATen.h>
 #include <ATen/cuda/detail/TensorInfo.cuh>
+#include <torch/extension.h>
 
 // We need our own `IndexToOffset` implementation since we do not want to
 // access the last element of the `indexptr`.
