rusty1s
diff --git a/‎LICENSE‎
Lines changed: 1 addition & 1 deletion b/‎LICENSE‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎csrc/cpu/index_info.h‎
Lines changed: 63 additions & 0 deletions b/‎csrc/cpu/index_info.h‎
Lines changed: 63 additions & 0 deletions
diff --git a/‎csrc/cpu/reducer.h‎
Lines changed: 84 additions & 0 deletions b/‎csrc/cpu/reducer.h‎
Lines changed: 84 additions & 0 deletions
diff --git a/‎csrc/cpu/segment_csr_cpu.cpp‎
Lines changed: 147 additions & 0 deletions b/‎csrc/cpu/segment_csr_cpu.cpp‎
Lines changed: 147 additions & 0 deletions
diff --git a/‎csrc/cpu/segment_csr_cpu.h‎
Lines changed: 11 additions & 0 deletions b/‎csrc/cpu/segment_csr_cpu.h‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎csrc/cpu/utils.h‎
Lines changed: 6 additions & 0 deletions b/‎csrc/cpu/utils.h‎
Lines changed: 6 additions & 0 deletions
@@ -1,4 +1,4 @@
-Copyright (c) 2019 Matthias Fey <[email protected]>
+Copyright (c) 2020 Matthias Fey <[email protected]>
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 
@@ -0,0 +1,63 @@
+#pragma once
+
+#include <torch/extension.h>
+
+#define MAX_TENSORINFO_DIMS 25
+
+template <typename scalar_t> struct TensorInfo {
+  TensorInfo(scalar_t *p, int dim, int sz[MAX_TENSORINFO_DIMS],
+             int st[MAX_TENSORINFO_DIMS]) {
+    data = p;
+    dims = dim;
+    AT_ASSERT(dims < MAX_TENSORINFO_DIMS);
+
+    for (int i = 0; i < dim; ++i) {
+      sizes[i] = sz[i];
+      strides[i] = st[i];
+    }
+  }
+
+  scalar_t *data;
+  int dims;
+  int sizes[MAX_TENSORINFO_DIMS];
+  int strides[MAX_TENSORINFO_DIMS];
+};
+
+template <typename scalar_t>
+TensorInfo<scalar_t> getTensorInfo(const torch::Tensor &tensor) {
+  int sizes[MAX_TENSORINFO_DIMS];
+  int strides[MAX_TENSORINFO_DIMS];
+
+  int dims = tensor.dim();
+  for (int i = 0; i < dims; ++i) {
+    sizes[i] = tensor.size(i);
+    strides[i] = tensor.stride(i);
+  }
+
+  return TensorInfo<scalar_t>(tensor.data_ptr<scalar_t>(), dims, sizes,
+                              strides);
+}
+
+template <typename scalar_t> struct IndexToOffset {
+  static inline int get(int idx, const TensorInfo<scalar_t> &info) {
+    int offset = 0;
+    for (int i = info.dims - 1; i >= 0; --i) {
+      offset += (idx % info.sizes[i]) * info.strides[i];
+      idx /= info.sizes[i];
+    }
+    return offset;
+  }
+};
+
+template <typename scalar_t> struct IndexPtrToOffset {
+  static inline int get(int idx, const TensorInfo<scalar_t> &info) {
+    int offset = idx % (info.sizes[info.dims - 1] - 1);
+    offset *= info.strides[info.dims - 1];
+    idx /= info.sizes[info.dims - 1] - 1;
+    for (int i = info.dims - 2; i >= 0; --i) {
+      offset += (idx % info.sizes[i]) * info.strides[i];
+      idx /= info.sizes[i];
+    }
+    return offset;
+  }
+};
@@ -0,0 +1,84 @@
+#pragma once
+
+#include <limits>
+#include <map>
+
+enum ReductionType { SUM, MEAN, MUL, DIV, MIN, MAX };
+
+const std::map<std::string, ReductionType> reduce2REDUCE = {
+    {"sum", SUM}, {"mean", MEAN}, {"mul", MUL},
+    {"div", DIV}, {"min", MIN},   {"max", MAX},
+};
+
+#define AT_DISPATCH_REDUCTION_TYPES(reduce, ...)                               \
+  [&] {                                                                        \
+    switch (reduce2REDUCE.at(reduce)) {                                        \
+    case SUM: {                                                                \
+      const ReductionType REDUCE = SUM;                                        \
+      return __VA_ARGS__();                                                    \
+    }                                                                          \
+    case MEAN: {                                                               \
+      const ReductionType REDUCE = MEAN;                                       \
+      return __VA_ARGS__();                                                    \
+    }                                                                          \
+    case MUL: {                                                                \
+      const ReductionType REDUCE = MUL;                                        \
+      return __VA_ARGS__();                                                    \
+    }                                                                          \
+    case DIV: {                                                                \
+      const ReductionType REDUCE = DIV;                                        \
+      return __VA_ARGS__();                                                    \
+    }                                                                          \
+    case MIN: {                                                                \
+      const ReductionType REDUCE = MIN;                                        \
+      return __VA_ARGS__();                                                    \
+    }                                                                          \
+    case MAX: {                                                                \
+      const ReductionType REDUCE = MAX;                                        \
+      return __VA_ARGS__();                                                    \
+    }                                                                          \
+    }                                                                          \
+  }()
+
+template <typename scalar_t, ReductionType REDUCE> struct Reducer {
+  static inline scalar_t init() {
+    if (REDUCE == MUL || REDUCE == DIV)
+      return (scalar_t)1;
+    else if (REDUCE == MIN)
+      return std::numeric_limits<scalar_t>::max();
+    else if (REDUCE == MAX)
+      return std::numeric_limits<scalar_t>::lowest();
+    else
+      return (scalar_t)0;
+  }
+
+  static inline void update(scalar_t *val, scalar_t new_val, int64_t *arg,
+                            int64_t new_arg) {
+    if (REDUCE == SUM || REDUCE == MEAN)
+      *val = *val + new_val;
+    else if (REDUCE == MUL)
+      *val = *val * new_val;
+    else if (REDUCE == DIV)
+      *val = *val / new_val;
+    else if ((REDUCE == MIN && new_val < *val) ||
+             (REDUCE == MAX && new_val > *val)) {
+      *val = new_val;
+      *arg = new_arg;
+    }
+  }
+
+  static inline void write(scalar_t *address, scalar_t val,
+                           int64_t *arg_address, int64_t arg, int count) {
+    if (REDUCE == SUM || REDUCE == MUL || REDUCE == DIV)
+      *address = val;
+    else if (REDUCE == MEAN)
+      *address = val / (count > 0 ? count : (scalar_t)1);
+    else if (REDUCE == MIN || REDUCE == MAX) {
+      if (count > 0) {
+        *address = val;
+        *arg_address = arg;
+      } else
+        *address = (scalar_t)0;
+    }
+  }
+};
@@ -0,0 +1,147 @@
+#include "segment_csr_cpu.h"
+
+#include "index_info.h"
+#include "reducer.h"
+#include "utils.h"
+
+std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
+segment_csr_cpu(torch::Tensor src, torch::Tensor indptr,
+                torch::optional<torch::Tensor> optional_out,
+                std::string reduce) {
+  CHECK_CPU(src);
+  CHECK_CPU(indptr);
+  if (optional_out.has_value())
+    CHECK_CPU(optional_out.value());
+
+  CHECK_INPUT(src.dim() >= indptr.dim());
+
+  auto sizes = indptr.sizes().vec();
+  for (auto i = 0; i < indptr.dim() - 1; i++)
+    sizes[i] = src.size(i);
+  indptr = indptr.expand(sizes);
+
+  auto dim = indptr.dim() - 1;
+
+  src = src.contiguous();
+
+  torch::Tensor out;
+  if (optional_out.has_value()) {
+    out = optional_out.value().contiguous();
+    for (int i = 0; i < out.dim(); i++)
+      if (i != dim)
+        CHECK_INPUT(src.size(i) == out.size(i));
+    CHECK_INPUT(out.size(dim) == indptr.size(dim) - 1);
+  } else {
+    sizes = src.sizes().vec();
+    sizes[dim] = indptr.size(dim) - 1;
+    out = torch::empty(sizes, src.options());
+  }
+
+  torch::optional<torch::Tensor> arg_out = torch::nullopt;
+  int64_t *arg_out_data = nullptr;
+  if (reduce2REDUCE.at(reduce) == MIN || reduce2REDUCE.at(reduce) == MAX) {
+    arg_out = torch::full(out.sizes(), src.size(dim), indptr.options());
+    arg_out_data = arg_out.value().data_ptr<int64_t>();
+  }
+
+  auto N = out.size(dim) * (indptr.numel() / indptr.size(-1));
+  auto K = out.numel() / N;
+  auto E = src.size(dim);
+
+  auto indptr_info = getTensorInfo<int64_t>(indptr);
+  auto stride = indptr_info.strides[indptr_info.dims - 1];
+  std::vector<int64_t> args(K);
+  AT_DISPATCH_ALL_TYPES(src.scalar_type(), "segment_csr", [&] {
+    auto src_data = src.data_ptr<scalar_t>();
+    auto out_data = out.data_ptr<scalar_t>();
+
+    std::vector<scalar_t> vals(K);
+    int64_t row_start, row_end;
+    AT_DISPATCH_REDUCTION_TYPES(reduce, [&] {
+      for (auto n = 0; n < N; n++) {
+        auto offset = IndexPtrToOffset<int64_t>::get(n, indptr_info);
+        row_start = indptr_info.data[offset];
+        row_end = indptr_info.data[offset + stride];
+
+        offset = (n / (indptr.size(-1) - 1)) * E * K;
+        for (auto k = 0; k < K; k++)
+          vals[k] = Reducer<scalar_t, REDUCE>::init();
+
+        for (auto e = row_start; e < row_end; e++) {
+          CHECK_INPUT(e < E);
+          for (auto k = 0; k < K; k++)
+            Reducer<scalar_t, REDUCE>::update(
+                &vals[k], src_data[offset + e * K + k], &args[k], e);
+        }
+
+        for (auto k = 0; k < K; k++)
+          Reducer<scalar_t, REDUCE>::write(out_data + n * K + k, vals[k],
+                                           arg_out_data + n * K + k, args[k],
+                                           row_end - row_start);
+      }
+    });
+  });
+
+  return std::make_tuple(out, arg_out);
+}
+
+torch::Tensor gather_csr_cpu(torch::Tensor src, torch::Tensor indptr,
+                             torch::optional<torch::Tensor> optional_out) {
+  CHECK_CPU(src);
+  CHECK_CPU(indptr);
+  if (optional_out.has_value())
+    CHECK_CPU(optional_out.value());
+
+  CHECK_INPUT(src.dim() >= indptr.dim());
+
+  auto sizes = indptr.sizes().vec();
+  for (auto i = 0; i < indptr.dim() - 1; i++)
+    sizes[i] = src.size(i);
+  indptr = indptr.expand(sizes);
+
+  auto dim = indptr.dim() - 1;
+  CHECK_INPUT(src.size(dim) == indptr.size(dim) - 1);
+
+  src = src.contiguous();
+
+  torch::Tensor out;
+  if (optional_out.has_value()) {
+    out = optional_out.value().contiguous();
+    for (auto i = 0; i < out.dim(); i++)
+      if (i != dim)
+        CHECK_INPUT(src.size(i) == out.size(i));
+  } else {
+    auto sizes = src.sizes().vec();
+    sizes[dim] = *indptr.flatten()[-1].data_ptr<int64_t>();
+    out = torch::empty(sizes, src.options());
+  }
+
+  auto N = src.size(dim) * (indptr.numel() / indptr.size(-1));
+  auto K = src.numel() / N;
+  auto E = out.size(dim);
+
+  auto indptr_info = getTensorInfo<int64_t>(indptr);
+  auto stride = indptr_info.strides[indptr_info.dims - 1];
+  AT_DISPATCH_ALL_TYPES(src.scalar_type(), "gather_csr", [&] {
+    auto src_data = src.data_ptr<scalar_t>();
+    auto out_data = out.data_ptr<scalar_t>();
+
+    std::vector<scalar_t> vals(K);
+    int64_t row_start, row_end;
+    for (int n = 0; n < N; n++) {
+      auto offset = IndexPtrToOffset<int64_t>::get(n, indptr_info);
+      row_start = indptr_info.data[offset];
+      row_end = indptr_info.data[offset + stride];
+
+      for (auto k = 0; k < K; k++)
+        vals[k] = src_data[n * K + k];
+
+      offset = (n / (indptr.size(-1) - 1)) * E * K;
+      for (auto e = row_start; e < row_end; e++)
+        for (auto k = 0; k < K; k++)
+          out_data[offset + e * K + k] = vals[k];
+    }
+  });
+
+  return out;
+}
@@ -0,0 +1,11 @@
+#pragma once
+
+#include <torch/extension.h>
+
+std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
+segment_csr_cpu(torch::Tensor src, torch::Tensor indptr,
+                torch::optional<torch::Tensor> optional_out,
+                std::string reduce);
+
+torch::Tensor gather_csr_cpu(torch::Tensor src, torch::Tensor indptr,
+                             torch::optional<torch::Tensor> optional_out);
@@ -0,0 +1,6 @@
+#pragma once
+
+#include <torch/extension.h>
+
+#define CHECK_CPU(x) AT_ASSERTM(x.device().is_cpu(), #x " must be CPU tensor")
+#define CHECK_INPUT(x) AT_ASSERTM(x, "Input mismatch")
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-Copyright (c) 2019 Matthias Fey <[email protected]>`
	`1`	`+Copyright (c) 2020 Matthias Fey <[email protected]>`
`2`	`2`
`3`	`3`	`Permission is hereby granted, free of charge, to any person obtaining a copy`
`4`	`4`	`of this software and associated documentation files (the "Software"), to deal`