Skip to content

Commit 1635fd9

Browse files
issue/440 feat: add softplus operator
1 parent 97f9ac7 commit 1635fd9

File tree

11 files changed

+522
-4
lines changed

11 files changed

+522
-4
lines changed

include/infiniop.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include "infiniop/ops/rms_norm.h"
1717
#include "infiniop/ops/rope.h"
1818
#include "infiniop/ops/rope_v2.h"
19+
#include "infiniop/ops/softplus.h"
1920
#include "infiniop/ops/sub.h"
2021
#include "infiniop/ops/swiglu.h"
2122
#include "infiniop/ops/topkrouter.h"

include/infiniop/ops/softplus.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
#ifndef __INFINIOP_SOFTPLUS_API_H__
2+
#define __INFINIOP_SOFTPLUS_API_H__
3+
4+
#include "../operator_descriptor.h"
5+
6+
typedef struct InfiniopDescriptor *infiniopSoftplusDescriptor_t;
7+
8+
__C __export infiniStatus_t infiniopCreateSoftplusDescriptor(infiniopHandle_t handle,
9+
infiniopSoftplusDescriptor_t *desc_ptr,
10+
infiniopTensorDescriptor_t y,
11+
infiniopTensorDescriptor_t x);
12+
13+
__C __export infiniStatus_t infiniopGetSoftplusWorkspaceSize(infiniopSoftplusDescriptor_t desc, size_t *size);
14+
15+
__C __export infiniStatus_t infiniopSoftplus(infiniopSoftplusDescriptor_t desc,
16+
void *workspace,
17+
size_t workspace_size,
18+
void *y,
19+
const void *x,
20+
void *stream);
21+
22+
__C __export infiniStatus_t infiniopDestroySoftplusDescriptor(infiniopSoftplusDescriptor_t desc);
23+
24+
#endif

scripts/python_test.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ def run_tests(args):
2424
"rope.py",
2525
"sub.py",
2626
"swiglu.py",
27+
"softplus.py",
2728
]:
2829
result = subprocess.run(
2930
f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
#include "softplus_cpu.h"
2+
3+
namespace op::softplus::cpu {
4+
5+
Descriptor::~Descriptor() = default;
6+
7+
infiniStatus_t Descriptor::create(
8+
infiniopHandle_t handle_,
9+
Descriptor **desc_ptr,
10+
infiniopTensorDescriptor_t out_desc,
11+
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
12+
13+
auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
14+
auto dtype = out_desc->dtype();
15+
16+
const auto &x_desc = input_desc_vec.at(0);
17+
const auto &y_shape = out_desc->shape();
18+
const auto &x_shape = x_desc->shape();
19+
20+
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
21+
22+
CHECK_SAME_SHAPE(y_shape, x_shape);
23+
24+
// create CPU elementwise descriptor
25+
CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
26+
27+
return INFINI_STATUS_SUCCESS;
28+
}
29+
30+
infiniStatus_t Descriptor::calculate(
31+
void *workspace,
32+
size_t workspace_size,
33+
void *output,
34+
std::vector<const void *> inputs,
35+
void *stream) const {
36+
37+
switch (_dtype) {
38+
case INFINI_DTYPE_F16:
39+
return _device_info->calculate<SoftplusOp, fp16_t>(_info, output, inputs, stream);
40+
case INFINI_DTYPE_F32:
41+
return _device_info->calculate<SoftplusOp, float>(_info, output, inputs, stream);
42+
case INFINI_DTYPE_F64:
43+
return _device_info->calculate<SoftplusOp, double>(_info, output, inputs, stream);
44+
case INFINI_DTYPE_BF16:
45+
return _device_info->calculate<SoftplusOp, bf16_t>(_info, output, inputs, stream);
46+
default:
47+
return INFINI_STATUS_BAD_TENSOR_DTYPE;
48+
}
49+
50+
return INFINI_STATUS_SUCCESS;
51+
}
52+
} // namespace op::softplus::cpu
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#ifndef __SOFTPLUS_CPU_H__
2+
#define __SOFTPLUS_CPU_H__
3+
4+
#include "../../../elementwise/cpu/elementwise_cpu.h"
5+
6+
ELEMENTWISE_DESCRIPTOR(softplus, cpu)
7+
8+
namespace op::softplus::cpu {
9+
typedef struct SoftplusOp {
10+
public:
11+
static constexpr size_t num_inputs = 1;
12+
template <typename T>
13+
T operator()(const T &x) const {
14+
if (x > T(20)) {
15+
return x;
16+
} else {
17+
return std::log(T(1) + std::exp(x));
18+
}
19+
}
20+
} SoftplusOp;
21+
} // namespace op::softplus::cpu
22+
23+
#endif // __SOFTPLUS_CPU_H__
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#ifndef __SOFTPLUS_CUDA_H__
2+
#define __SOFTPLUS_CUDA_H__
3+
4+
namespace op::softplus::cuda {
5+
typedef struct SoftplusOp {
6+
public:
7+
static constexpr size_t num_inputs = 1;
8+
9+
template <typename T>
10+
__device__ __forceinline__ T operator()(const T &x) const {
11+
if constexpr (std::is_same_v<T, half>) {
12+
// promote to float for stability, then cast back
13+
float xf = __half2float(x);
14+
float out = (xf > 20.0f) ? xf : log1pf(expf(xf));
15+
return __float2half(out);
16+
} else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
17+
float xf = __bfloat162float(x);
18+
float out = (xf > 20.0f) ? xf : log1pf(expf(xf));
19+
return __float2bfloat16(out);
20+
} else if constexpr (std::is_same_v<T, half2>) {
21+
// process as two lanes
22+
float2 xf = __half22float2(x);
23+
xf.x = (xf.x > 20.0f) ? xf.x : log1pf(expf(xf.x));
24+
xf.y = (xf.y > 20.0f) ? xf.y : log1pf(expf(xf.y));
25+
return __floats2half2_rn(xf.x, xf.y);
26+
} else {
27+
// default: float, double, etc.
28+
return (x > T(20)) ? x : log1p(exp(x));
29+
}
30+
}
31+
} SoftplusOp;
32+
} // namespace op::softplus::cuda
33+
34+
#endif // __SOFTPLUS_CUDA_H__
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
2+
3+
#include "../cuda/kernel.cuh"
4+
#include "softplus_nvidia.cuh"
5+
6+
namespace op::softplus::nvidia {
7+
8+
Descriptor::~Descriptor() = default;
9+
10+
infiniStatus_t Descriptor::create(
11+
infiniopHandle_t handle_,
12+
Descriptor **desc_ptr,
13+
infiniopTensorDescriptor_t out_desc,
14+
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
15+
16+
auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
17+
auto dtype = out_desc->dtype();
18+
19+
const auto &x_desc = input_desc_vec.at(0);
20+
const auto &y_shape = out_desc->shape();
21+
const auto &x_shape = x_desc->shape();
22+
23+
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
24+
25+
CHECK_SAME_SHAPE(y_shape, x_shape);
26+
27+
// create CUDA elementwise descriptor
28+
CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
29+
30+
return INFINI_STATUS_SUCCESS;
31+
}
32+
33+
infiniStatus_t Descriptor::calculate(
34+
void *workspace,
35+
size_t workspace_size,
36+
void *output,
37+
std::vector<const void *> inputs,
38+
void *stream) const {
39+
40+
if (workspace_size < _workspace_size) {
41+
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
42+
}
43+
44+
switch (_dtype) {
45+
case INFINI_DTYPE_F16:
46+
return _device_info->calculate<256, cuda::SoftplusOp, half>(_info, workspace, output, inputs, stream);
47+
case INFINI_DTYPE_BF16:
48+
return _device_info->calculate<256, cuda::SoftplusOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
49+
case INFINI_DTYPE_F32:
50+
return _device_info->calculate<256, cuda::SoftplusOp, float>(_info, workspace, output, inputs, stream);
51+
case INFINI_DTYPE_F64:
52+
return _device_info->calculate<256, cuda::SoftplusOp, double>(_info, workspace, output, inputs, stream);
53+
default:
54+
return INFINI_STATUS_BAD_TENSOR_DTYPE;
55+
}
56+
57+
return INFINI_STATUS_SUCCESS;
58+
}
59+
} // namespace op::softplus::nvidia
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
#ifndef __SOFTPLUS_CUDA_API_H__
2+
#define __SOFTPLUS_CUDA_API_H__
3+
4+
#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
5+
6+
ELEMENTWISE_DESCRIPTOR(softplus, nvidia)
7+
8+
#endif // __SOFTPLUS_CUDA_API_H__
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
#include "../../operator.h"
2+
#include "../../handle.h"
3+
#include "infiniop/ops/softplus.h"
4+
5+
#ifdef ENABLE_CPU_API
6+
#include "cpu/softplus_cpu.h"
7+
#endif
8+
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
9+
#include "nvidia/softplus_nvidia.cuh"
10+
#endif
11+
#ifdef ENABLE_METAX_API
12+
#include "metax/softplus_metax.h"
13+
#endif
14+
15+
__C infiniStatus_t infiniopCreateSoftplusDescriptor(
16+
infiniopHandle_t handle,
17+
infiniopSoftplusDescriptor_t *desc_ptr,
18+
infiniopTensorDescriptor_t y_desc,
19+
infiniopTensorDescriptor_t x_desc) {
20+
21+
#define CREATE(CASE, NAMESPACE) \
22+
case CASE: \
23+
return op::softplus::NAMESPACE::Descriptor::create( \
24+
handle, \
25+
reinterpret_cast<op::softplus::NAMESPACE::Descriptor **>(desc_ptr), \
26+
y_desc, \
27+
{x_desc})
28+
29+
switch (handle->device) {
30+
31+
#ifdef ENABLE_CPU_API
32+
CREATE(INFINI_DEVICE_CPU, cpu);
33+
#endif
34+
#ifdef ENABLE_NVIDIA_API
35+
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
36+
#endif
37+
#ifdef ENABLE_ILUVATAR_API
38+
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
39+
#endif
40+
41+
default:
42+
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
43+
}
44+
45+
#undef CREATE
46+
}
47+
48+
__C infiniStatus_t infiniopGetSoftplusWorkspaceSize(infiniopSoftplusDescriptor_t desc, size_t *size) {
49+
50+
#define GET(CASE, NAMESPACE) \
51+
case CASE: \
52+
*size = reinterpret_cast<op::softplus::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
53+
return INFINI_STATUS_SUCCESS
54+
55+
switch (desc->device_type) {
56+
#ifdef ENABLE_CPU_API
57+
GET(INFINI_DEVICE_CPU, cpu);
58+
#endif
59+
#ifdef ENABLE_NVIDIA_API
60+
GET(INFINI_DEVICE_NVIDIA, nvidia);
61+
#endif
62+
#ifdef ENABLE_ILUVATAR_API
63+
GET(INFINI_DEVICE_ILUVATAR, nvidia);
64+
#endif
65+
66+
default:
67+
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
68+
}
69+
#undef GET
70+
71+
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
72+
}
73+
74+
__C infiniStatus_t infiniopSoftplus(
75+
infiniopSoftplusDescriptor_t desc,
76+
void *workspace,
77+
size_t workspace_size,
78+
void *y,
79+
const void *x,
80+
void *stream) {
81+
82+
#define CALCULATE(CASE, NAMESPACE) \
83+
case CASE: \
84+
return reinterpret_cast<const op::softplus::NAMESPACE::Descriptor *>(desc) \
85+
->calculate(workspace, workspace_size, y, {x}, stream)
86+
87+
switch (desc->device_type) {
88+
89+
#ifdef ENABLE_CPU_API
90+
CALCULATE(INFINI_DEVICE_CPU, cpu);
91+
#endif
92+
#ifdef ENABLE_NVIDIA_API
93+
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
94+
#endif
95+
#ifdef ENABLE_ILUVATAR_API
96+
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
97+
#endif
98+
99+
default:
100+
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
101+
}
102+
103+
#undef CALCULATE
104+
}
105+
106+
__C infiniStatus_t
107+
infiniopDestroySoftplusDescriptor(infiniopSoftplusDescriptor_t desc) {
108+
109+
#define DELETE(CASE, NAMESPACE) \
110+
case CASE: \
111+
delete reinterpret_cast<const op::softplus::NAMESPACE::Descriptor *>(desc); \
112+
return INFINI_STATUS_SUCCESS
113+
114+
switch (desc->device_type) {
115+
116+
#ifdef ENABLE_CPU_API
117+
DELETE(INFINI_DEVICE_CPU, cpu);
118+
#endif
119+
#ifdef ENABLE_NVIDIA_API
120+
DELETE(INFINI_DEVICE_NVIDIA, nvidia);
121+
#endif
122+
#ifdef ENABLE_ILUVATAR_API
123+
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
124+
#endif
125+
126+
default:
127+
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
128+
}
129+
130+
#undef DELETE
131+
}

0 commit comments

Comments
 (0)