Commit ea94e54

cpu boilerplate

1 parent d824c8b

7 files changed: +121 -32 lines changed

benchmark/gather.py

Lines changed: 13 additions & 5 deletions

@@ -30,13 +30,16 @@ def correctness(dataset):
 
             assert torch.allclose(out1, out2, atol=1e-4)
             assert torch.allclose(out1, out3, atol=1e-4)
-        except RuntimeError:
+        except RuntimeError as e:
+            if 'out of memory' not in str(e):
+                raise RuntimeError(e)
             torch.cuda.empty_cache()
 
 
 def time_func(func, x):
     try:
-        torch.cuda.synchronize()
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
         t = time.perf_counter()
 
         if not args.with_backward:
@@ -49,9 +52,12 @@ def time_func(func, x):
                 out = func(x)
                 torch.autograd.grad(out, x, out, only_inputs=True)
 
-        torch.cuda.synchronize()
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
         return time.perf_counter() - t
-    except RuntimeError:
+    except RuntimeError as e:
+        if 'out of memory' not in str(e):
+            raise RuntimeError(e)
         torch.cuda.empty_cache()
         return float('inf')
 
@@ -88,7 +94,9 @@ def gat_csr(x):
 
             del x
 
-        except RuntimeError:
+        except RuntimeError as e:
+            if 'out of memory' not in str(e):
+                raise RuntimeError(e)
             torch.cuda.empty_cache()
             for t in (t1, t2, t3, t4):
                 t.append(float('inf'))
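The pattern these hunks introduce is what makes the benchmark usable on CPU-only machines: torch.cuda.synchronize() is guarded behind torch.cuda.is_available(), and only out-of-memory RuntimeErrors are swallowed, with everything else re-raised. A minimal self-contained sketch of the same pattern (the iteration count and workload are illustrative, not the benchmark's actual values):

import time
import torch

def time_func(func, x, iters=10):
    """Device-agnostic timing with OOM tolerance."""
    try:
        if torch.cuda.is_available():
            torch.cuda.synchronize()  # drain pending kernels before timing
        t = time.perf_counter()
        for _ in range(iters):
            func(x)
        if torch.cuda.is_available():
            torch.cuda.synchronize()  # wait for the timed kernels to finish
        return time.perf_counter() - t
    except RuntimeError as e:
        if 'out of memory' not in str(e):
            raise  # genuine error: propagate it
        torch.cuda.empty_cache()  # OOM: release cached blocks, report failure
        return float('inf')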

benchmark/scatter_segment.py

Lines changed: 21 additions & 8 deletions

@@ -82,13 +82,16 @@ def correctness(dataset):
             assert torch.allclose(out1, out2, atol=1e-4)
             assert torch.allclose(out1, out3, atol=1e-4)
 
-        except RuntimeError:
+        except RuntimeError as e:
+            if 'out of memory' not in str(e):
+                raise RuntimeError(e)
             torch.cuda.empty_cache()
 
 
 def time_func(func, x):
     try:
-        torch.cuda.synchronize()
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
         t = time.perf_counter()
 
         if not args.with_backward:
@@ -102,9 +105,12 @@ def time_func(func, x):
                 out = out[0] if isinstance(out, tuple) else out
                 torch.autograd.grad(out, x, out, only_inputs=True)
 
-        torch.cuda.synchronize()
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
         return time.perf_counter() - t
-    except RuntimeError:
+    except RuntimeError as e:
+        if 'out of memory' not in str(e):
+            raise RuntimeError(e)
         torch.cuda.empty_cache()
         return float('inf')
 
@@ -152,7 +158,9 @@ def dense2(x):
 
             del x
 
-        except RuntimeError:
+        except RuntimeError as e:
+            if 'out of memory' not in str(e):
+                raise RuntimeError(e)
             torch.cuda.empty_cache()
             for t in (t1, t2, t3, t4):
                 t.append(float('inf'))
@@ -167,7 +175,9 @@ def dense2(x):
 
             del x
 
-        except RuntimeError:
+        except RuntimeError as e:
+            if 'out of memory' not in str(e):
+                raise RuntimeError(e)
             torch.cuda.empty_cache()
             for t in (t5, t6):
                 t.append(float('inf'))
@@ -197,8 +207,11 @@ def dense2(x):
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('--reduce', type=str, required=True,
-                        choices=['add', 'mean', 'min', 'max'])
+    parser.add_argument(
+        '--reduce',
+        type=str,
+        required=True,
+        choices=['add', 'mean', 'min', 'max'])
     parser.add_argument('--with_backward', action='store_true')
    parser.add_argument('--device', type=str, default='cuda')
    args = parser.parse_args()
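With the guarded synchronize calls above, the existing --device flag (which still defaults to 'cuda') can now meaningfully target either device. Something along these lines should work, assuming the script is invoked from the repository root:

python benchmark/scatter_segment.py --reduce add --device cpu
python benchmark/scatter_segment.py --reduce max --with_backward --device cuda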

cpu/gather.cpp

Lines changed: 28 additions & 0 deletions

@@ -0,0 +1,28 @@
+#include <torch/extension.h>
+
+#define CHECK_CPU(x) AT_ASSERTM(!x.type().is_cuda(), #x " must be CPU tensor")
+
+at::Tensor gather_csr(at::Tensor src, at::Tensor indptr,
+                      at::optional<at::Tensor> out_opt) {
+  CHECK_CPU(src);
+  CHECK_CPU(indptr);
+  if (out_opt.has_value())
+    CHECK_CPU(out_opt.value());
+  AT_ASSERTM(false, "Not yet implemented");
+  return src;
+}
+
+at::Tensor gather_coo(at::Tensor src, at::Tensor index,
+                      at::optional<at::Tensor> out_opt) {
+  CHECK_CPU(src);
+  CHECK_CPU(index);
+  if (out_opt.has_value())
+    CHECK_CPU(out_opt.value());
+  AT_ASSERTM(false, "Not yet implemented");
+  return src;
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("gather_csr", &gather_csr, "Gather CSR (CPU)");
+  m.def("gather_coo", &gather_coo, "Gather COO (CPU)");
+}
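The file is pure boilerplate: both entry points type-check their inputs with CHECK_CPU and then fail with "Not yet implemented". A sketch of how the stub could be exercised stand-alone via PyTorch's JIT extension loader, rather than through setup.py (the source path assumes the repository root as working directory):

import torch
from torch.utils.cpp_extension import load

# Compile and import cpu/gather.cpp on the fly.
gather_cpu = load(name='gather_cpu', sources=['cpu/gather.cpp'])

src = torch.randn(10, 4)
indptr = torch.tensor([0, 4, 7, 10])
# The CPU checks pass; the stub then raises "Not yet implemented".
gather_cpu.gather_csr(src, indptr, None)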

cpu/segment.cpp

Lines changed: 29 additions & 0 deletions

@@ -0,0 +1,29 @@
+#include <torch/extension.h>
+
+#define CHECK_CPU(x) AT_ASSERTM(!x.type().is_cuda(), #x " must be CPU tensor")
+
+std::tuple<at::Tensor, at::optional<at::Tensor>>
+segment_csr(at::Tensor src, at::Tensor indptr, at::optional<at::Tensor> out_opt,
+            std::string reduce) {
+  CHECK_CPU(src);
+  CHECK_CPU(indptr);
+  if (out_opt.has_value())
+    CHECK_CPU(out_opt.value());
+  AT_ASSERTM(false, "Not yet implemented");
+  return std::make_tuple(src, at::nullopt);
+}
+
+std::tuple<at::Tensor, at::optional<at::Tensor>>
+segment_coo(at::Tensor src, at::Tensor index, at::Tensor out,
+            std::string reduce) {
+  CHECK_CPU(src);
+  CHECK_CPU(index);
+  CHECK_CPU(out);
+  AT_ASSERTM(false, "Not yet implemented");
+  return std::make_tuple(src, at::nullopt);
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("segment_csr", &segment_csr, "Segment CSR (CPU)");
+  m.def("segment_coo", &segment_coo, "Segment COO (CPU)");
+}
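For what the eventual CPU kernel has to compute: segment_csr reduces consecutive slices of src whose boundaries are given by the CSR pointer vector indptr. A pure-PyTorch reference sketch of those semantics, inferred from how the CUDA counterpart is used in torch_scatter/segment.py (the Python loop is illustrative only, not the planned implementation):

import torch

def segment_csr_reference(src, indptr, reduce='add'):
    # Reduce each slice src[indptr[i]:indptr[i + 1]] along dim 0.
    outs = []
    for i in range(indptr.numel() - 1):
        chunk = src[indptr[i]:indptr[i + 1]]
        if reduce == 'add':
            outs.append(chunk.sum(dim=0))
        elif reduce == 'mean':
            outs.append(chunk.mean(dim=0))
        elif reduce == 'min':
            outs.append(chunk.min(dim=0).values)
        else:  # 'max'
            outs.append(chunk.max(dim=0).values)
    return torch.stack(outs, dim=0)

src = torch.arange(10, dtype=torch.float).view(10, 1)
indptr = torch.tensor([0, 4, 7, 10])
print(segment_csr_reference(src, indptr, 'add'))  # [[6.], [15.], [24.]]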

setup.py

Lines changed: 5 additions & 3 deletions

@@ -25,16 +25,18 @@
 ext_modules = []
 exts = [e.split(osp.sep)[-1][:-4] for e in glob(osp.join('cpu', '*.cpp'))]
 ext_modules += [
-    CppExtension(f'torch_scatter.{ext}_cpu', [f'cpu/{ext}.cpp'],
-                 extra_compile_args=cxx_extra_compile_args) for ext in exts
+    CppExtension(
+        f'torch_scatter.{ext}_cpu', [f'cpu/{ext}.cpp'],
+        extra_compile_args=cxx_extra_compile_args) for ext in exts
 ]
 
 if CUDA_HOME is not None and USE_GPU:
     exts = [e.split(osp.sep)[-1][:-4] for e in glob(osp.join('cuda', '*.cpp'))]
     ext_modules += [
         CUDAExtension(
             f'torch_scatter.{ext}_cuda',
-            [f'cuda/{ext}.cpp', f'cuda/{ext}_kernel.cu'], extra_compile_args={
+            [f'cuda/{ext}.cpp', f'cuda/{ext}_kernel.cu'],
+            extra_compile_args={
                 'cxx': cxx_extra_compile_args,
                 'nvcc': nvcc_extra_compile_args,
             }) for ext in exts
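The CPU extension list mirrors the existing CUDA one: every cpu/*.cpp file becomes a torch_scatter.<name>_cpu module. For the two files added in this commit, the glob/stem expression resolves as follows (a quick illustration, assuming the repository root as working directory):

import os.path as osp
from glob import glob

exts = [e.split(osp.sep)[-1][:-4] for e in glob(osp.join('cpu', '*.cpp'))]
# With cpu/gather.cpp and cpu/segment.cpp present:
#   exts == ['gather', 'segment']
# -> CppExtension('torch_scatter.gather_cpu', ['cpu/gather.cpp'], ...), etc.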

torch_scatter/gather.py

Lines changed: 9 additions & 4 deletions

@@ -1,8 +1,13 @@
 import torch
 
+from torch_scatter import segment_cpu, gather_cpu
+
 if torch.cuda.is_available():
     from torch_scatter import gather_cuda, segment_cuda
 
+gat = lambda is_cuda: gather_cuda if is_cuda else gather_cpu  # noqa
+seg = lambda is_cuda: segment_cuda if is_cuda else segment_cpu  # noqa
+
 
 class GatherCOO(torch.autograd.Function):
     @staticmethod
@@ -12,15 +17,15 @@ def forward(ctx, src, index, out):
         ctx.src_size = list(src.size())
         ctx.save_for_backward(index)
 
-        return gather_cuda.gather_coo(src, index, out)
+        return gat(src.is_cuda).gather_coo(src, index, out)
 
     @staticmethod
     def backward(ctx, grad_out):
         (index, ), src_size = ctx.saved_tensors, ctx.src_size
 
         grad_src = None
         if ctx.needs_input_grad[0]:
-            grad_src, _ = segment_cuda.segment_coo(
+            grad_src, _ = seg(grad_out.is_cuda).segment_coo(
                 grad_out, index, grad_out.new_zeros(src_size), 'add')
 
         return grad_src, None, None
@@ -34,15 +39,15 @@ def forward(ctx, src, indptr, out):
         ctx.src_size = list(src.size())
         ctx.save_for_backward(indptr)
 
-        return gather_cuda.gather_csr(src, indptr, out)
+        return gat(src.is_cuda).gather_csr(src, indptr, out)
 
     @staticmethod
     def backward(ctx, grad_out):
         (indptr, ), src_size = ctx.saved_tensors, ctx.src_size
 
         grad_src = None
         if ctx.needs_input_grad[0]:
-            grad_src, _ = segment_cuda.segment_csr(
+            grad_src, _ = seg(grad_out.is_cuda).segment_csr(
                 grad_out, indptr, grad_out.new_empty(src_size), 'add')
 
         return grad_src, None, None
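The gat/seg lambdas do more than save typing: a lambda body is evaluated only when called, so the name gather_cuda is never touched on a CPU-only install where the conditional import above was skipped. A standalone sketch of that late-binding behaviour (the names here are illustrative):

# 'cuda_module' is deliberately never defined, mimicking a CPU-only install.
pick = lambda is_cuda: cuda_module if is_cuda else 'cpu module'

print(pick(False))  # works: 'cpu module' -- cuda_module is never evaluated
# pick(True) would raise NameError, just as requesting CUDA would fail on a
# build without the CUDA extensions.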

torch_scatter/segment.py

Lines changed: 16 additions & 12 deletions

@@ -1,10 +1,14 @@
 import torch
 
+from torch_scatter import segment_cpu, gather_cpu
 from torch_scatter.helpers import min_value, max_value
 
 if torch.cuda.is_available():
     from torch_scatter import segment_cuda, gather_cuda
 
+seg = lambda is_cuda: segment_cuda if is_cuda else segment_cpu  # noqa
+gat = lambda is_cuda: gather_cuda if is_cuda else gather_cpu  # noqa
+
 
 class SegmentCOO(torch.autograd.Function):
     @staticmethod
@@ -28,7 +32,7 @@ def forward(ctx, src, index, out, dim_size, reduce):
 
             out = src.new_full(size, fill_value)
 
-        out, arg_out = segment_cuda.segment_coo(src, index, out, reduce)
+        out, arg_out = seg(src.is_cuda).segment_coo(src, index, out, reduce)
 
         if fill_value != 0:
             out.masked_fill_(out == fill_value, 0)
@@ -47,13 +51,13 @@ def backward(ctx, grad_out, *args):
         grad_src = None
         if ctx.needs_input_grad[0]:
             if ctx.reduce == 'add':
-                grad_src = gather_cuda.gather_coo(grad_out, index,
-                                                  grad_out.new_empty(src_size))
+                grad_src = gat(grad_out.is_cuda).gather_coo(
+                    grad_out, index, grad_out.new_empty(src_size))
             elif ctx.reduce == 'mean':
-                grad_src = gather_cuda.gather_coo(grad_out, index,
-                                                  grad_out.new_empty(src_size))
+                grad_src = gat(grad_out.is_cuda).gather_coo(
+                    grad_out, index, grad_out.new_empty(src_size))
                 count = arg_out
-                count = gather_cuda.gather_coo(
+                count = gat(grad_out.is_cuda).gather_coo(
                     count, index, count.new_empty(src_size[:index.dim()]))
                 for _ in range(grad_out.dim() - index.dim()):
                     count = count.unsqueeze(-1)
@@ -78,7 +82,7 @@ def forward(ctx, src, indptr, out, reduce):
         ctx.reduce = reduce
         ctx.src_size = list(src.size())
 
-        out, arg_out = segment_cuda.segment_csr(src, indptr, out, reduce)
+        out, arg_out = seg(src.is_cuda).segment_csr(src, indptr, out, reduce)
         ctx.save_for_backward(indptr, arg_out)
         return out if arg_out is None else (out, arg_out)
 
@@ -89,15 +93,15 @@ def backward(ctx, grad_out, *args):
         grad_src = None
         if ctx.needs_input_grad[0]:
             if ctx.reduce == 'add':
-                grad_src = gather_cuda.gather_csr(grad_out, indptr,
-                                                  grad_out.new_empty(src_size))
+                grad_src = gat(grad_out.is_cuda).gather_csr(
+                    grad_out, indptr, grad_out.new_empty(src_size))
             elif ctx.reduce == 'mean':
-                grad_src = gather_cuda.gather_csr(grad_out, indptr,
-                                                  grad_out.new_empty(src_size))
+                grad_src = gat(grad_out.is_cuda).gather_csr(
+                    grad_out, indptr, grad_out.new_empty(src_size))
                 indptr1 = indptr.narrow(-1, 0, indptr.size(-1) - 1)
                 indptr2 = indptr.narrow(-1, 1, indptr.size(-1) - 1)
                 count = (indptr2 - indptr1).to(grad_src.dtype)
-                count = gather_cuda.gather_csr(
+                count = gat(grad_out.is_cuda).gather_csr(
                     count, indptr, count.new_empty(src_size[:indptr.dim()]))
                 for _ in range(grad_out.dim() - indptr.dim()):
                     count = count.unsqueeze(-1)
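For the 'mean' reduction, the backward pass gathers grad_out back to the input shape and divides each row by the length of its segment; for CSR those lengths are just consecutive differences of indptr, which is what the indptr1/indptr2 narrowing computes. A small worked illustration of that count computation (the pointer values are made up):

import torch

indptr = torch.tensor([0, 4, 7, 10])
indptr1 = indptr.narrow(-1, 0, indptr.size(-1) - 1)  # tensor([0, 4, 7])
indptr2 = indptr.narrow(-1, 1, indptr.size(-1) - 1)  # tensor([4, 7, 10])
count = indptr2 - indptr1                            # segment sizes: [4, 3, 3]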
