Skip to content

Commit f38ea0d

Browse files
committed
[T1-1-1]: Hardswish operator with cpu nvidia metax iluvatar and test
1 parent 8bb2121 commit f38ea0d

File tree

11 files changed

+720
-0
lines changed

11 files changed

+720
-0
lines changed

include/infiniop/ops/hardswish.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
#ifndef __INFINIOP_HARDSWISH_API_H__
2+
#define __INFINIOP_HARDSWISH_API_H__
3+
4+
#include "../operator_descriptor.h"
5+
6+
typedef struct InfiniopDescriptor *infiniopHardswishDescriptor_t;
7+
8+
__C __export infiniStatus_t infiniopCreateHardswishDescriptor(infiniopHandle_t handle,
9+
infiniopHardswishDescriptor_t *desc_ptr,
10+
infiniopTensorDescriptor_t output,
11+
infiniopTensorDescriptor_t input);
12+
13+
__C __export infiniStatus_t infiniopGetHardswishWorkspaceSize(infiniopHardswishDescriptor_t desc, size_t *size);
14+
15+
__C __export infiniStatus_t infiniopHardswish(infiniopHardswishDescriptor_t desc,
16+
void *workspace,
17+
size_t workspace_size,
18+
void *output,
19+
const void *input,
20+
void *stream);
21+
22+
__C __export infiniStatus_t infiniopDestroyHardswishDescriptor(infiniopHardswishDescriptor_t desc);
23+
24+
#endif
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
#include "ops.hpp"
2+
#include "utils.hpp"
3+
#include <infinirt.h>
4+
#include <iomanip>
5+
#include <iostream>
6+
7+
namespace infiniop_test::hardswish {
8+
struct Test::Attributes {
9+
std::shared_ptr<Tensor> input;
10+
std::shared_ptr<Tensor> output;
11+
std::shared_ptr<Tensor> ans;
12+
};
13+
14+
std::shared_ptr<Test> Test::build(
15+
std::unordered_map<std::string, std::vector<uint8_t>> attributes,
16+
std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
17+
double rtol, double atol) {
18+
auto test = std::shared_ptr<Test>(new Test(rtol, atol));
19+
test->_attributes = new Attributes();
20+
if (tensors.find("input") == tensors.end()
21+
|| tensors.find("output") == tensors.end()
22+
|| tensors.find("ans") == tensors.end()) {
23+
throw std::runtime_error("Invalid Test");
24+
}
25+
26+
test->_attributes->input = tensors["input"];
27+
test->_attributes->output = tensors["output"];
28+
test->_attributes->ans = tensors["ans"];
29+
30+
auto elemType = test->_attributes->input->ggml_type();
31+
if (elemType == GGML_TYPE_BF16) {
32+
test->_rtol = 1e-2;
33+
test->_atol = 1e-2;
34+
}
35+
if (elemType == GGML_TYPE_F16) {
36+
test->_rtol = 1e-3;
37+
test->_atol = 1e-3;
38+
}
39+
if (elemType == GGML_TYPE_F32) {
40+
test->_rtol = 1e-6;
41+
test->_atol = 1e-6;
42+
}
43+
44+
return test;
45+
}
46+
47+
std::shared_ptr<infiniop_test::Result> Test::run(
48+
infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
49+
infiniopHardswishDescriptor_t op_desc;
50+
auto input = _attributes->input->to(device, device_id);
51+
auto output = _attributes->output->to(device, device_id);
52+
CHECK_OR(infiniopCreateHardswishDescriptor(handle, &op_desc,
53+
output->desc(),
54+
input->desc()),
55+
return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor."));
56+
size_t workspace_size;
57+
CHECK_OR(infiniopGetHardswishWorkspaceSize(op_desc, &workspace_size),
58+
return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
59+
void *workspace;
60+
CHECK_OR(infinirtMalloc(&workspace, workspace_size),
61+
return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
62+
CHECK_OR(infiniopHardswish(op_desc, workspace, workspace_size,
63+
output->data(),
64+
input->data(),
65+
nullptr),
66+
return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));
67+
68+
try {
69+
allClose(output, _attributes->ans, _rtol, _atol);
70+
} catch (const std::exception &e) {
71+
return TEST_FAILED(RESULT_INCORRECT, e.what());
72+
}
73+
74+
double elapsed_time = 0.;
75+
76+
elapsed_time = benchmark(
77+
[=]() {
78+
infiniopHardswish(
79+
op_desc, workspace, workspace_size,
80+
output->data(),
81+
input->data(),
82+
nullptr);
83+
},
84+
warm_ups, iterations);
85+
86+
return TEST_PASSED(elapsed_time);
87+
}
88+
89+
std::vector<std::string> Test::attribute_names() {
90+
return {};
91+
}
92+
93+
std::vector<std::string> Test::tensor_names() {
94+
return {"input", "output", "ans"};
95+
}
96+
97+
std::vector<std::string> Test::output_names() {
98+
return {"output"};
99+
}
100+
101+
std::string Test::toString() const {
102+
std::ostringstream oss;
103+
oss << op_name() << std::endl;
104+
oss << "- input: " << _attributes->input->info() << std::endl;
105+
oss << "- output: " << _attributes->output->info() << std::endl;
106+
oss << std::scientific << std::setprecision(2);
107+
oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
108+
return oss.str();
109+
}
110+
111+
Test::~Test() {
112+
delete _attributes;
113+
}
114+
} // namespace infiniop_test::hardswish
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
#include "hardswish_cpu.h"
2+
3+
namespace op::hardswish::cpu {
4+
5+
Descriptor::~Descriptor() = default;
6+
7+
infiniStatus_t Descriptor::create(
8+
infiniopHandle_t handle_,
9+
Descriptor **desc_ptr,
10+
infiniopTensorDescriptor_t out_desc,
11+
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
12+
13+
auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
14+
auto dtype = out_desc->dtype();
15+
16+
const auto &input_desc = input_desc_vec.at(0);
17+
const auto &output_shape = out_desc->shape();
18+
const auto &input_shape = input_desc->shape();
19+
20+
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
21+
22+
CHECK_SAME_SHAPE(output_shape, input_shape);
23+
24+
// create CPU elementwise descriptor
25+
CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
26+
27+
return INFINI_STATUS_SUCCESS;
28+
}
29+
30+
infiniStatus_t Descriptor::calculate(
31+
void *workspace,
32+
size_t workspace_size,
33+
void *output,
34+
std::vector<const void *> inputs,
35+
void *stream) const {
36+
37+
switch (_dtype) {
38+
case INFINI_DTYPE_F16:
39+
return _device_info->calculate<HardswishOp, fp16_t>(_info, output, inputs, stream);
40+
case INFINI_DTYPE_F32:
41+
return _device_info->calculate<HardswishOp, float>(_info, output, inputs, stream);
42+
case INFINI_DTYPE_F64:
43+
return _device_info->calculate<HardswishOp, double>(_info, output, inputs, stream);
44+
case INFINI_DTYPE_BF16:
45+
return _device_info->calculate<HardswishOp, bf16_t>(_info, output, inputs, stream);
46+
default:
47+
return INFINI_STATUS_BAD_TENSOR_DTYPE;
48+
}
49+
50+
return INFINI_STATUS_SUCCESS;
51+
}
52+
} // namespace op::hardswish::cpu
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#ifndef __HARDSWISH_CPU_H__
2+
#define __HARDSWISH_CPU_H__
3+
4+
#include <algorithm>
5+
#include "../../../elementwise/cpu/elementwise_cpu.h"
6+
7+
ELEMENTWISE_DESCRIPTOR(hardswish, cpu)
8+
9+
namespace op::hardswish::cpu {
10+
typedef struct HardswishOp {
11+
public:
12+
static constexpr size_t num_inputs = 1;
13+
14+
template <typename T>
15+
T operator()(const T &input) const {
16+
if constexpr (std::is_integral_v<T>) {
17+
return static_cast<T>(0);
18+
} else {
19+
// x * clamp(x + 3, 0, 6) / 6
20+
auto x = static_cast<double>(input);
21+
double y = x + 3.0;
22+
y = std::min(std::max(y, 0.0), 6.0);
23+
double out = x * (y / 6.0);
24+
return static_cast<T>(out);
25+
}
26+
}
27+
} HardswishOp;
28+
} // namespace op::hardswish::cpu
29+
30+
#endif // __HARDSWISH_CPU_H__
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
#ifndef __HARDSWISH_CUDA_H__
2+
#define __HARDSWISH_CUDA_H__
3+
4+
#include <cuda_fp16.h>
5+
#include <cuda_bf16.h>
6+
#include <cmath>
7+
8+
namespace op::hardswish::cuda {
9+
10+
typedef struct HardswishOp {
11+
static constexpr size_t num_inputs = 1;
12+
13+
// Hardswish: f(x) = x * clamp(x + 3, 0, 6) / 6
14+
__device__ __forceinline__ float hswish_f32(float x) const {
15+
float y = x + 3.0f;
16+
y = y < 0.0f ? 0.0f : (y > 6.0f ? 6.0f : y);
17+
return x * (y * (1.0f / 6.0f));
18+
}
19+
20+
template <typename T>
21+
__device__ __forceinline__ T operator()(const T &input) const {
22+
if constexpr (std::is_same_v<T, half2>) {
23+
float2 vf = __half22float2(input);
24+
float2 vr = make_float2(
25+
hswish_f32(vf.x),
26+
hswish_f32(vf.y)
27+
);
28+
return __float22half2_rn(vr);
29+
} else if constexpr (std::is_same_v<T, half>) {
30+
float xf = __half2float(input);
31+
float yf = hswish_f32(xf);
32+
return __float2half_rn(yf);
33+
} else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
34+
float f0 = __bfloat162float(__low2bfloat16(input));
35+
float f1 = __bfloat162float(__high2bfloat16(input));
36+
return __floats2bfloat162_rn(hswish_f32(f0), hswish_f32(f1));
37+
} else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
38+
float xf = __bfloat162float(input);
39+
return __float2bfloat16_rz(hswish_f32(xf));
40+
} else if constexpr (std::is_same_v<T, float>) {
41+
return hswish_f32(input);
42+
} else if constexpr (std::is_same_v<T, double>) {
43+
double xd = static_cast<double>(input);
44+
double yd = xd * (std::fmin(std::fmax(xd + 3.0, 0.0), 6.0) / 6.0);
45+
return static_cast<T>(yd);
46+
} else {
47+
double xd = static_cast<double>(input);
48+
double yd = xd * (std::fmin(std::fmax(xd + 3.0, 0.0), 6.0) / 6.0);
49+
return static_cast<T>(yd);
50+
}
51+
}
52+
} HardswishOp;
53+
54+
} // namespace op::hardswish::cuda
55+
56+
#endif // __HARDSWISH_CUDA_H__
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
#ifndef __HARDSWISH_METAX_API_H__
2+
#define __HARDSWISH_METAX_API_H__
3+
4+
#include "../../../elementwise/metax/elementwise_metax_api.h"
5+
6+
ELEMENTWISE_DESCRIPTOR(hardswish, metax)
7+
8+
#endif // __HARDSWISH_METAX_API_H__
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
#include "hardswish_metax.h"
2+
3+
#include "../../../elementwise/metax/elementwise_metax.h"
4+
5+
#include "../cuda/kernel.cuh"
6+
7+
namespace op::hardswish::metax {
8+
9+
Descriptor::~Descriptor() = default;
10+
11+
infiniStatus_t Descriptor::create(
12+
infiniopHandle_t handle_,
13+
Descriptor **desc_ptr,
14+
infiniopTensorDescriptor_t out_desc,
15+
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
16+
17+
auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
18+
auto dtype = out_desc->dtype();
19+
20+
const auto &input_desc = input_desc_vec.at(0);
21+
const auto &output_shape = out_desc->shape();
22+
const auto &input_shape = input_desc->shape();
23+
24+
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
25+
26+
CHECK_SAME_SHAPE(output_shape, input_shape);
27+
28+
// create CUDA elementwise descriptor
29+
CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
30+
31+
return INFINI_STATUS_SUCCESS;
32+
}
33+
34+
infiniStatus_t Descriptor::calculate(
35+
void *workspace,
36+
size_t workspace_size,
37+
void *output,
38+
std::vector<const void *> inputs,
39+
void *stream) const {
40+
41+
if (workspace_size < _workspace_size) {
42+
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
43+
}
44+
45+
switch (_dtype) {
46+
case INFINI_DTYPE_F16:
47+
return _device_info->calculate<256, cuda::HardswishOp, half>(_info, workspace, output, inputs, stream);
48+
case INFINI_DTYPE_BF16:
49+
return _device_info->calculate<256, cuda::HardswishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
50+
case INFINI_DTYPE_F32:
51+
return _device_info->calculate<256, cuda::HardswishOp, float>(_info, workspace, output, inputs, stream);
52+
case INFINI_DTYPE_F64:
53+
return _device_info->calculate<256, cuda::HardswishOp, double>(_info, workspace, output, inputs, stream);
54+
default:
55+
return INFINI_STATUS_BAD_TENSOR_DTYPE;
56+
}
57+
58+
return INFINI_STATUS_SUCCESS;
59+
}
60+
} // namespace op::hardswish::metax

0 commit comments

Comments
 (0)