
Commit 9e4789c

Merge pull request #78 from InfiniTensor/dev-hardsigmoid
add HardSigmoid cpu/cuda kernel
2 parents 8b6333d + a7ed032 commit 9e4789c
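
For reference, this implements the ONNX HardSigmoid operator, a clamped affine function; the tests below use the ONNX default coefficients:

$$
\mathrm{HardSigmoid}(x) = \max\bigl(0,\ \min(1,\ \alpha x + \beta)\bigr),
\qquad \alpha = 0.2,\ \beta = 0.5 \ \text{(ONNX defaults)}
$$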

15 files changed: +464 -1 lines changed
Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
```cpp
#ifndef KERNEL_HARD_SIGMOID_H
#define KERNEL_HARD_SIGMOID_H

#include "../collector.h"

namespace refactor::kernel {

    struct HardSigmoidCollector final : public InfoCollector {
        float alpha, beta;

        constexpr HardSigmoidCollector(decltype(_target) target, float alpha_, float beta_) noexcept
            : InfoCollector(target), alpha(alpha_), beta(beta_) {}

        std::vector<KernelBox>
        filter(TensorRefs inputs, TensorRefs outputs) const final;
    };
}// namespace refactor::kernel

#endif// KERNEL_HARD_SIGMOID_H
```
Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
```cpp
#include "kernel/collectors/hard_sigmoid.h"
#include "../kernels/hard_sigmoid/cpu_kernel.hh"
#include "../kernels/hard_sigmoid/cuda_kernel.hh"

namespace refactor::kernel {

    std::vector<KernelBox>
    HardSigmoidCollector::filter(TensorRefs inputs, TensorRefs outputs) const {
        auto const &a = inputs[0];

        std::vector<KernelBox> ans;
        switch (_target) {
            case decltype(_target)::Cpu:
                if (auto ptr = HardSigmoidCpu::build(alpha, beta, a); ptr) {
                    ans.emplace_back(std::move(ptr));
                }
                break;
            case decltype(_target)::Nvidia:
                if (auto ptr = HardSigmoidCuda::build(alpha, beta, a); ptr) {
                    ans.emplace_back(std::move(ptr));
                }
                break;
            default:
                UNREACHABLEX(void, "Unknown target");
        }
        return ans;
    }

}// namespace refactor::kernel
```
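
Note the pattern: each backend's `build` returns an empty `KernelBox` when it cannot handle the tensor (unsupported data type, or CUDA not compiled in), so `filter` quietly collects only viable candidates. A minimal, self-contained sketch of this nullable-build dispatch idiom, using stand-in types (`FakeKernel`, `Target`) rather than the project's real ones:

```cpp
// Sketch of the nullable-build dispatch idiom; FakeKernel and Target
// are stand-ins, not the project's real types.
#include <memory>
#include <vector>

enum class Target { Cpu, Nvidia };

struct FakeKernel {
    // build() returns nullptr when the configuration is unsupported,
    // so the collector silently skips this backend.
    static std::unique_ptr<FakeKernel> build(bool supported) {
        return supported ? std::make_unique<FakeKernel>() : nullptr;
    }
};

std::vector<std::unique_ptr<FakeKernel>> filter(Target target) {
    std::vector<std::unique_ptr<FakeKernel>> ans;
    switch (target) {
        case Target::Cpu:
            if (auto ptr = FakeKernel::build(true); ptr) {
                ans.emplace_back(std::move(ptr));
            }
            break;
        case Target::Nvidia:
            if (auto ptr = FakeKernel::build(false); ptr) {
                ans.emplace_back(std::move(ptr));
            }
            break;
    }
    return ans;// candidates in preference order; may be empty
}
```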
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
```cpp
#include "cpu_kernel.hh"
#include <algorithm>
#include <execution>

namespace refactor::kernel {
    using K = HardSigmoidCpu;
    using DT = DataType;

    K::HardSigmoidCpu(float alpha_, float beta_, DT dataType_, size_t size_) noexcept
        : Kernel(), alpha(alpha_), beta(beta_), dataType(dataType_), size(size_) {}

    auto K::build(float alpha_, float beta_, Tensor const &a) noexcept -> KernelBox {
        if (!a.dataType.isCpuNumberic()) {
            return nullptr;
        }
        return std::make_unique<K>(alpha_, beta_, a.dataType, a.elementsSize());
    }

    auto K::typeId() noexcept -> size_t {
        static uint8_t ID = 1;
        return reinterpret_cast<size_t>(&ID);
    }

    auto K::kernelTypeId() const noexcept -> size_t { return typeId(); }
    auto K::description() const noexcept -> std::string_view {
        return "Performing HardSigmoid using CPU";
    }

    template<class T>
    static Routine lowerTyped(float alpha_, float beta_, size_t size) {
        using namespace runtime;

        return [=](Resources &, void *workspace, void const *const *inputs, void *const *outputs) {
            auto x = reinterpret_cast<T const *>(inputs[0]);
            auto y = reinterpret_cast<T *>(outputs[0]);
            std::for_each_n(std::execution::par_unseq,
                            natural_t(0), size,
                            [&](auto i) {
                                y[i] = std::clamp(alpha_ * x[i] + beta_, static_cast<T>(0), static_cast<T>(1));
                            });
        };
    }

    auto K::lower(Resources &) const noexcept -> RoutineWorkspace {
        switch (dataType) {
            case DT::F32:
                return lowerTyped<float>(alpha, beta, size);
            case DT::F64:
                return lowerTyped<double>(alpha, beta, size);
            default:
                UNREACHABLE();
        }
    }
}// namespace refactor::kernel
```
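
Since the routine's math is just a clamped affine map, a few values can be verified by hand with nothing but the standard library. The snippet below is a standalone sketch, not project code:

```cpp
// Standalone check of the same elementwise math as lowerTyped,
// standard library only.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    float alpha = 0.2f, beta = 0.5f;
    std::vector<float> x{-5.f, 0.f, 1.f, 2.f, 3.f, 10.f}, y(x.size());
    std::transform(x.begin(), x.end(), y.begin(), [&](float v) {
        return std::clamp(alpha * v + beta, 0.f, 1.f);// same formula as lowerTyped
    });
    for (auto v : y) { std::printf("%g ", v); }// prints: 0 0.5 0.7 0.9 1 1
    std::printf("\n");
}
```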
Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
```cpp
#ifndef KERNEL_HARD_SIGMOID_CPU_KERNEL_HH
#define KERNEL_HARD_SIGMOID_CPU_KERNEL_HH

#include "kernel/collectors/hard_sigmoid.h"
#include "kernel/tensor.h"

namespace refactor::kernel {

    struct HardSigmoidCpu final : public Kernel {
        float alpha, beta;
        DataType dataType;
        size_t size;

        explicit HardSigmoidCpu(float, float, DataType, size_t) noexcept;

        static KernelBox build(float, float, Tensor const &) noexcept;
        static size_t typeId() noexcept;

        size_t kernelTypeId() const noexcept final;
        std::string_view description() const noexcept final;
        RoutineWorkspace lower(Resources &) const noexcept final;
    };

}// namespace refactor::kernel

#endif// KERNEL_HARD_SIGMOID_CPU_KERNEL_HH
```
Lines changed: 88 additions & 0 deletions
@@ -0,0 +1,88 @@
```cpp
#include "cuda_kernel.hh"

#ifdef USE_CUDA
#include "../../generator/nvrtc_repo.h"
#include "kernel/cuda/threads_distributer.cuh"
#include <cuda_runtime.h>
#endif

namespace refactor::kernel {
    using K = HardSigmoidCuda;
    using DT = DataType;

    K::HardSigmoidCuda(float alpha_, float beta_, DT dt_, size_t size_) noexcept
        : Kernel(), alpha(alpha_), beta(beta_), dataType(dt_), size(size_) {}

    auto K::build(float alpha_, float beta_, Tensor const &a) noexcept -> KernelBox {
#ifndef USE_CUDA
        return nullptr;
#endif
        return std::make_unique<K>(alpha_, beta_, a.dataType, a.elementsSize());
    }

    auto K::typeId() noexcept -> size_t {
        static uint8_t ID = 1;
        return reinterpret_cast<size_t>(&ID);
    }
    auto K::kernelTypeId() const noexcept -> size_t {
        return typeId();
    }
    auto K::description() const noexcept -> std::string_view {
        return "Performing hardsigmoid operation on Nvidia GPU";
    }

#ifdef USE_CUDA
    constexpr static const char *TEMPLATE = R"~(
__device__ __forceinline__ static {0:} fn({0:} x) {{
    return {1:};
}}

extern "C" __global__ void kernel(
    {0:} *__restrict__ y,
    {0:} const *__restrict__ x,
    size_t n
) {{
    for (auto tid = blockIdx.x * blockDim.x + threadIdx.x,
              step = blockDim.x * gridDim.x;
         tid < n;
         tid += step)
        y[tid] = fn(x[tid]);
}}
)~";
    auto K::lower(Resources &res) const -> RoutineWorkspace {
        using namespace runtime;

        std::string op = "";
        switch (dataType) {
            case DT::F32:
                op = fmt::format("fmaxf(0.f, fminf(1.f, fmaf({}, x, {})))", alpha, beta);
                break;
            case DT::F64:
                op = fmt::format("fmax(0.0, fmin(1.0, fma({}, x, {})))",
                                 static_cast<double>(alpha), static_cast<double>(beta));
                break;
            case DT::FP16:
                op = fmt::format("__hmax(CUDART_ZERO_FP16, __hmin(CUDART_ONE_FP16, (__float2half({}) * x + __float2half({}))))",
                                 alpha, beta);
                break;
            default:
                UNREACHABLE();
        }
        auto name = fmt::format("hardsigmoid_{}_{}_{}", dataType.name(), alpha, beta);
        auto code = fmt::format(TEMPLATE, nvrtc::dataType(dataType), op);
        return [h = nvrtc::Handler::compile(name.c_str(), code.c_str(), "kernel"),
                params = cuda::ThreadsDistributer()(size)](
                   Resources &, void *, void const *const *inputs, void *const *outputs) {
            auto y = outputs[0];
            auto x = inputs[0];
            auto n = params.n;
            void *args[]{&y, &x, &n};
            h->launch(params.gridSize, 1, 1,
                      params.blockSize, 1, 1,
                      0, args);
        };
    }
#endif

}// namespace refactor::kernel
```
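
The NVRTC template expands to an ordinary grid-stride loop. For illustration, a statically compiled equivalent of the F32 instantiation (with alpha = 0.2f and beta = 0.5f baked in as constants; this sketch bypasses NVRTC and the project's launch plumbing) might look like:

```cuda
// Sketch of what NVRTC generates for F32 with alpha = 0.2f, beta = 0.5f.
// The real code is produced from TEMPLATE at runtime.
#include <cstdio>
#include <cuda_runtime.h>

__device__ __forceinline__ static float fn(float x) {
    return fmaxf(0.f, fminf(1.f, fmaf(0.2f, x, 0.5f)));
}

extern "C" __global__ void kernel(float *__restrict__ y,
                                  float const *__restrict__ x,
                                  size_t n) {
    // grid-stride loop: each thread handles indices tid, tid+step, ...
    for (auto tid = blockIdx.x * blockDim.x + threadIdx.x,
              step = blockDim.x * gridDim.x;
         tid < n;
         tid += step)
        y[tid] = fn(x[tid]);
}

int main() {
    size_t n = 30;
    float *x, *y;
    cudaMallocManaged(&x, n * sizeof(float));
    cudaMallocManaged(&y, n * sizeof(float));
    for (size_t i = 0; i < n; ++i) { x[i] = float(i); }
    kernel<<<1, 256>>>(y, x, n);
    cudaDeviceSynchronize();
    std::printf("%g %g %g %g\n", y[0], y[1], y[2], y[3]);// 0.5 0.7 0.9 1
    cudaFree(x);
    cudaFree(y);
}
```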
Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
```cpp
#ifndef KERNEL_HARD_SIGMOID_CUDA_KERNEL_HH
#define KERNEL_HARD_SIGMOID_CUDA_KERNEL_HH

#include "kernel/collectors/hard_sigmoid.h"
#include "kernel/tensor.h"

namespace refactor::kernel {

    struct HardSigmoidCuda final : public Kernel {
        float alpha, beta;
        DataType dataType;
        size_t size;

        explicit HardSigmoidCuda(float, float, DataType, size_t) noexcept;

        static KernelBox build(float, float, Tensor const &) noexcept;
        static size_t typeId() noexcept;

        size_t kernelTypeId() const noexcept final;
        std::string_view description() const noexcept final;
#ifdef USE_CUDA
        RoutineWorkspace lower(Resources &) const final;
#endif
    };

}// namespace refactor::kernel

#endif// KERNEL_HARD_SIGMOID_CUDA_KERNEL_HH
```
Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
```cpp
#include "../../../src/kernels/hard_sigmoid/cpu_kernel.hh"
#include <gtest/gtest.h>

using namespace refactor;
using namespace kernel;

TEST(kernel, HardSigmoidCpu) {
    // build routine
    auto dataTensor = Tensor::share(DataType::F32, Shape{2, 3, 5});
    float alpha = 0.2f, beta = 0.5f;
    auto kernel = HardSigmoidCpu::build(alpha, beta, *dataTensor);
    ASSERT_TRUE(kernel);
    auto res = runtime::Resources();
    auto routine = kernel->lower(res).routine;
    // put input data
    std::vector<float> result(dataTensor->elementsSize());
    for (auto i : range0_(result.size())) { result[i] = i; }
    // inference
    {
        void const *inputs[]{result.data()};
        void *outputs[]{result.data()};
        routine(res, nullptr, inputs, outputs);
    }
    std::vector<float> output = {0.5, 0.7, 0.9, 1., 1., 1., 1., 1., 1.,
                                 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
                                 1., 1., 1., 1., 1., 1.};
    // check
    for (auto i : range0_(result.size())) {
        EXPECT_FLOAT_EQ(output[i], result[i]);
    }
}
```
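
The expected vector follows directly from the formula with $\alpha = 0.2$, $\beta = 0.5$, and $x_i = i$:

$$
y_0 = 0.2 \cdot 0 + 0.5 = 0.5,\qquad
y_1 = 0.2 \cdot 1 + 0.5 = 0.7,\qquad
y_2 = 0.2 \cdot 2 + 0.5 = 0.9,
$$
$$
y_i = \min(1,\ 0.2\,i + 0.5) = 1 \quad\text{for } 3 \le i < 30.
$$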
Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,49 @@
```cpp
#ifdef USE_CUDA

#include "../../../src/kernels/hard_sigmoid/cpu_kernel.hh"
#include "../../../src/kernels/hard_sigmoid/cuda_kernel.hh"
#include "hardware/device_manager.h"
#include <gtest/gtest.h>

using namespace refactor;
using namespace kernel;
using namespace hardware;

TEST(kernel, HardSigmoidCuda) {
    // build routine
    auto dataTensor = Tensor::share(DataType::F32, Shape{2, 3, 5});
    float alpha = 0.2f, beta = 0.5f;
    auto kernel = HardSigmoidCuda::build(alpha, beta, *dataTensor);
    auto kCpu = HardSigmoidCpu::build(alpha, beta, *dataTensor);
    ASSERT_TRUE(kernel && kCpu);
    auto res = runtime::Resources();
    auto routine = kernel->lower(res).routine,
         rCpu = kCpu->lower(res).routine;
    // malloc
    auto &dev = *device::init(Device::Type::Nvidia, 0, "");
    auto gpuMem = dev.malloc(dataTensor->bytesSize());
    // put input data
    std::vector<float> data(dataTensor->elementsSize());
    for (auto i : range0_(data.size())) { data[i] = i; }
    gpuMem->copyFromHost(data.data(), dataTensor->bytesSize());
    // inference
    {
        void const *inputs[]{*gpuMem};
        void *outputs[]{*gpuMem};
        routine(res, nullptr, inputs, outputs);
    }
    {
        void const *inputs[]{data.data()};
        void *outputs[]{data.data()};
        rCpu(res, nullptr, inputs, outputs);
    }
    // take output data
    std::vector<float> result(dataTensor->elementsSize());
    gpuMem->copyToHost(result.data(), dataTensor->bytesSize());
    // check
    for (auto i : range0_(data.size())) {
        EXPECT_FLOAT_EQ(data[i], result[i]);
    }
}

#endif
```
Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
```cpp
#ifndef COMPUTATION_HARD_SIGMOID_H
#define COMPUTATION_HARD_SIGMOID_H

#include "../operator.h"

namespace refactor::computation {

    struct HardSigmoid final : public Operator {
        float alpha, beta;

        constexpr HardSigmoid(float alpha_, float beta_) noexcept
            : Operator(), alpha(alpha_), beta(beta_) {}

        static size_t typeId() noexcept;
        size_t opTypeId() const noexcept final;
        std::string_view name() const noexcept final;
        kernel::CollectorBox candidateKernels(Target) const noexcept final;
        std::string serialize() const noexcept final;
    };

}// namespace refactor::computation

#endif// COMPUTATION_HARD_SIGMOID_H
```
Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
```cpp
#include "computation/operators/hard_sigmoid.h"
#include "kernel/collectors/hard_sigmoid.h"

namespace refactor::computation {
    using Op = HardSigmoid;

    auto Op::typeId() noexcept -> size_t {
        static uint8_t ID = 1;
        return reinterpret_cast<size_t>(&ID);
    }
    auto Op::opTypeId() const noexcept -> size_t { return typeId(); }
    auto Op::name() const noexcept -> std::string_view { return "HardSigmoid"; }

    auto Op::candidateKernels(Target target) const noexcept -> kernel::CollectorBox {
        using Collector_ = kernel::HardSigmoidCollector;
        return std::make_unique<Collector_>(target, alpha, beta);
    }
    auto Op::serialize() const noexcept -> std::string {
        return fmt::format("{}()", name());
    }

}// namespace refactor::computation
```
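
`typeId` here uses the same idiom as both kernels: a function-local `static` byte has a unique address for the lifetime of the process, so casting that address to `size_t` yields a collision-free ID with no central registry. A minimal, self-contained sketch of the idiom:

```cpp
// Address-as-ID idiom: each function owns one static byte whose address
// is process-unique, giving stable, distinct IDs without registration.
#include <cassert>
#include <cstddef>
#include <cstdint>

static std::size_t idA() {
    static uint8_t ID = 1;
    return reinterpret_cast<std::size_t>(&ID);
}
static std::size_t idB() {
    static uint8_t ID = 1;
    return reinterpret_cast<std::size_t>(&ID);
}

int main() {
    assert(idA() == idA());// stable across calls
    assert(idA() != idB());// distinct per function
}
```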
