Skip to content

Commit 5e58311

Browse files
authored
Merge branch 'Sxy-17:v0.1.0' into mooer/dev
2 parents d0930ee + 19f1322 commit 5e58311

32 files changed

+1281
-30
lines changed
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
#ifndef __ADD_MOORE_API_H__
2+
#define __ADD_MOORE_API_H__
3+
4+
#include "../../../elementwise/moore/elementwise_moore_api.h"
5+
6+
ELEMENTWISE_DESCRIPTOR(add, moore)
7+
8+
#endif // __ADD_MOORE_API_H__
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
#include "add_moore.h"
2+
3+
#include "../../../elementwise/moore/elementwise_moore.h"
4+
5+
#include "add_moore_kernel.h"
6+
7+
namespace op::add::moore {
8+
9+
Descriptor::~Descriptor() = default;
10+
11+
infiniStatus_t Descriptor::create(
12+
infiniopHandle_t handle_,
13+
Descriptor **desc_ptr,
14+
infiniopTensorDescriptor_t out_desc,
15+
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
16+
17+
auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
18+
auto dtype = out_desc->dtype();
19+
20+
const auto &a_desc = input_desc_vec.at(0);
21+
const auto &b_desc = input_desc_vec.at(1);
22+
const auto &c_shape = out_desc->shape();
23+
const auto &a_shape = a_desc->shape();
24+
const auto &b_shape = b_desc->shape();
25+
26+
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, INFINI_DTYPE_I32, INFINI_DTYPE_I64);
27+
28+
CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
29+
30+
// create MOORE elementwise descriptor
31+
CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
32+
33+
return INFINI_STATUS_SUCCESS;
34+
}
35+
36+
infiniStatus_t Descriptor::calculate(
37+
void *workspace,
38+
size_t workspace_size,
39+
void *output,
40+
std::vector<const void *> inputs,
41+
void *stream) const {
42+
43+
if (workspace_size < _workspace_size) {
44+
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
45+
}
46+
47+
switch (_dtype) {
48+
case INFINI_DTYPE_F16:
49+
return _device_info->calculate<256, moore::AddOp, half>(_info, workspace, output, inputs, stream);
50+
case INFINI_DTYPE_BF16:
51+
return _device_info->calculate<256, moore::AddOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
52+
case INFINI_DTYPE_F32:
53+
return _device_info->calculate<256, moore::AddOp, float>(_info, workspace, output, inputs, stream);
54+
case INFINI_DTYPE_F64:
55+
return _device_info->calculate<256, moore::AddOp, double>(_info, workspace, output, inputs, stream);
56+
case INFINI_DTYPE_I32:
57+
return _device_info->calculate<256, moore::AddOp, int32_t>(_info, workspace, output, inputs, stream);
58+
case INFINI_DTYPE_I64:
59+
return _device_info->calculate<256, moore::AddOp, int64_t>(_info, workspace, output, inputs, stream);
60+
default:
61+
return INFINI_STATUS_BAD_TENSOR_DTYPE;
62+
}
63+
64+
return INFINI_STATUS_SUCCESS;
65+
}
66+
} // namespace op::add::moore
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
#ifndef __ADD_MOORE_KERNEL_H__
2+
#define __ADD_MOORE_KERNEL_H__
3+
4+
/*
5+
* This file contains the Add operation implementation for the MUSA backend.
6+
*
7+
* It uses the 'op::add::cuda' namespace to maintain a consistent code structure
8+
* and interface with the CUDA implementation, ensuring code alignment across different
9+
* hardware platforms.
10+
*/
11+
12+
namespace op::add::moore {
13+
typedef struct AddOp {
14+
public:
15+
static constexpr size_t num_inputs = 2;
16+
template <typename T>
17+
__device__ __forceinline__ T operator()(const T &a, const T &b) const {
18+
if constexpr (std::is_same_v<T, half2>) {
19+
return __hadd2(a, b);
20+
} else if constexpr (std::is_same_v<T, half>) {
21+
return __hadd(a, b);
22+
} else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
23+
// On MUSA platform, convert to float, add, then convert back to avoid ambiguous conversion
24+
// from int (returned by __hadd) to __mt_bfloat16
25+
float a_f = __bfloat162float(a);
26+
float b_f = __bfloat162float(b);
27+
return __float2bfloat16_rn(a_f + b_f);
28+
} else if constexpr (std::is_same_v<T, float>) {
29+
// Use __fadd_rn instead of __fadd_rd for moore platform compatibility
30+
return __fadd_rn(a, b);
31+
} else {
32+
return a + b;
33+
}
34+
}
35+
} AddOp;
36+
} // namespace op::add::moore
37+
38+
#endif // __ADD_MOORE_KERNEL_H__

src/infiniop/ops/add/operator.cc

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@
1818
#ifdef ENABLE_CAMBRICON_API
1919
#include "bang/add_bang.h"
2020
#endif
21+
#ifdef ENABLE_MOORE_API
22+
#include "moore/add_moore.h"
23+
#endif
2124

2225
__C infiniStatus_t infiniopCreateAddDescriptor(
2326
infiniopHandle_t handle,
@@ -58,6 +61,9 @@ __C infiniStatus_t infiniopCreateAddDescriptor(
5861
#ifdef ENABLE_CAMBRICON_API
5962
CREATE(INFINI_DEVICE_CAMBRICON, bang);
6063
#endif
64+
#ifdef ENABLE_MOORE_API
65+
CREATE(INFINI_DEVICE_MOORE, moore);
66+
#endif
6167

6268
default:
6369
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -94,6 +100,9 @@ __C infiniStatus_t infiniopGetAddWorkspaceSize(infiniopAddDescriptor_t desc, siz
94100
#endif
95101
#ifdef ENABLE_CAMBRICON_API
96102
GET(INFINI_DEVICE_CAMBRICON, bang);
103+
#endif
104+
#ifdef ENABLE_MOORE_API
105+
GET(INFINI_DEVICE_MOORE, moore);
97106
#endif
98107
default:
99108
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -140,6 +149,9 @@ __C infiniStatus_t infiniopAdd(
140149
#ifdef ENABLE_CAMBRICON_API
141150
CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
142151
#endif
152+
#ifdef ENABLE_MOORE_API
153+
CALCULATE(INFINI_DEVICE_MOORE, moore);
154+
#endif
143155

144156
default:
145157
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -179,6 +191,9 @@ infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc) {
179191
#ifdef ENABLE_CAMBRICON_API
180192
DELETE(INFINI_DEVICE_CAMBRICON, bang);
181193
#endif
194+
#ifdef ENABLE_MOORE_API
195+
DELETE(INFINI_DEVICE_MOORE, moore);
196+
#endif
182197

183198
default:
184199
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
#ifndef __CONV_MOORE_H__
2+
#define __CONV_MOORE_H__
3+
4+
#include "conv_mudnn.h"
5+
6+
namespace op::conv::moore {
7+
8+
// Descriptor class for CONV operations on Moore devices.
9+
// This class acts as a wrapper to select mudnn backend.
10+
// It encapsulates the backend-specific Descriptor implementation and provides
11+
// a unified interface for workspace query and CONV calculation.
12+
class Descriptor final : public InfiniopDescriptor {
13+
public:
14+
// Destructor: deletes the backend-specific descriptor.
15+
~Descriptor() {
16+
delete reinterpret_cast<mudnn::Descriptor *>(_impl);
17+
}
18+
19+
// Returns the required workspace size for the CONV operation.
20+
size_t workspaceSize() const {
21+
return reinterpret_cast<mudnn::Descriptor *>(_impl)->workspaceSize();
22+
}
23+
24+
// Static factory method to create a Descriptor instance.
25+
// This method chooses the backend (mudnn) and constructs
26+
// the corresponding implementation internally.
27+
static infiniStatus_t create(
28+
infiniopHandle_t handle,
29+
Descriptor **desc_ptr,
30+
infiniopTensorDescriptor_t y_desc,
31+
infiniopTensorDescriptor_t x_desc,
32+
infiniopTensorDescriptor_t w_desc,
33+
infiniopTensorDescriptor_t b_desc,
34+
const void *pads,
35+
const void *strides,
36+
const void *dilations,
37+
size_t n) {
38+
auto desc = new Descriptor(handle->device, handle->device_id);
39+
40+
// Backend selection strategy:
41+
// Currently defaulting to MUDNN.
42+
// Can be modified to choose based on environment variables or runtime parameters.
43+
desc->_backend = Backend::MUDNN;
44+
45+
mudnn::Descriptor *impl;
46+
auto status = mudnn::Descriptor::create(handle, &impl, y_desc, x_desc, w_desc, b_desc, pads, strides, dilations, n);
47+
if (status != INFINI_STATUS_SUCCESS) {
48+
delete desc;
49+
return status;
50+
}
51+
desc->_impl = impl;
52+
53+
*desc_ptr = desc;
54+
return INFINI_STATUS_SUCCESS;
55+
}
56+
57+
// Unified CONV calculation interface.
58+
// Calls the corresponding backend's calculate function internally.
59+
infiniStatus_t calculate(
60+
void *workspace, size_t workspace_size,
61+
void *y,
62+
const void *x,
63+
const void *w,
64+
const void *bias,
65+
void *stream) const {
66+
return reinterpret_cast<mudnn::Descriptor *>(_impl)
67+
->calculate(workspace, workspace_size, y, x, w, bias, stream);
68+
}
69+
70+
private:
71+
// Private constructor: ensures users cannot directly instantiate Descriptor.
72+
// Instances must be created via the static create() factory method.
73+
Descriptor(infiniDevice_t device_type, int device_id)
74+
: InfiniopDescriptor{device_type, device_id}, _impl(nullptr) {}
75+
76+
// Enum to indicate which backend is being used internally.
77+
enum class Backend { MUDNN };
78+
79+
Backend _backend; // Currently selected MUDNN backend
80+
void *_impl; // Pointer to backend-specific descriptor (mudnn::Descriptor*)
81+
};
82+
83+
} // namespace op::conv::moore
84+
85+
#endif // __CONV_MOORE_H__
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
#ifndef __CONV_MUDNN_H__
2+
#define __CONV_MUDNN_H__
3+
4+
#include "../conv.h"
5+
6+
DESCRIPTOR(mudnn)
7+
8+
#endif // __CONV_MUDNN_H__

0 commit comments

Comments
 (0)