Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions include/infiniop.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "infiniop/ops/conv.h"
#include "infiniop/ops/dequantize_awq.h"
#include "infiniop/ops/gemm.h"
#include "infiniop/ops/layer_norm.h"
#include "infiniop/ops/mul.h"
#include "infiniop/ops/random_sample.h"
#include "infiniop/ops/rearrange.h"
Expand All @@ -21,5 +22,11 @@
#include "infiniop/ops/gelu.h"
#include "infiniop/ops/gelutanh.h"
#include "infiniop/ops/quickgelu.h"
#include "infiniop/ops/sigmoid.h"
#include "infiniop/ops/softmax.h"
#include "infiniop/ops/swiglu.h"
#include "infiniop/ops/tanh.h"
#include "infiniop/ops/topkrouter.h"
#include "infiniop/tensor_descriptor.h"

#endif // __INFINIOP_API_H__
24 changes: 24 additions & 0 deletions include/infiniop/ops/gelu.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#ifndef __INFINIOP_GELU_API_H__
#define __INFINIOP_GELU_API_H__

#include "../operator_descriptor.h"

typedef struct InfiniopDescriptor *infiniopGeluDescriptor_t;

// Create a GELU operator descriptor for the given output/input tensor pair.
// (Fixed parameter-name typo: "intput" -> "input".)
__C __export infiniStatus_t infiniopCreateGeluDescriptor(infiniopHandle_t handle,
                                                         infiniopGeluDescriptor_t *desc_ptr,
                                                         infiniopTensorDescriptor_t output,
                                                         infiniopTensorDescriptor_t input);

// Query the workspace size (in bytes) required by infiniopGelu.
__C __export infiniStatus_t infiniopGetGeluWorkspaceSize(infiniopGeluDescriptor_t desc, size_t *size);

// Launch the GELU operator on the given stream, reading `input` and writing `output`.
__C __export infiniStatus_t infiniopGelu(infiniopGeluDescriptor_t desc,
                                         void *workspace,
                                         size_t workspace_size,
                                         void *output,
                                         const void *input,
                                         void *stream);

// Destroy the descriptor and release its resources.
__C __export infiniStatus_t infiniopDestroyGeluDescriptor(infiniopGeluDescriptor_t desc);

#endif // __INFINIOP_GELU_API_H__
43 changes: 43 additions & 0 deletions include/infiniop/ops/gelutanh.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#ifndef __INFINIOP_GELUTANH_API_H__
#define __INFINIOP_GELUTANH_API_H__

#include "../operator_descriptor.h"

typedef struct InfiniopDescriptor *infiniopGeluTanhDescriptor_t;

// Create a descriptor for the tanh-approximated GELU operator:
//   y = x * 0.5 * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
__C __export infiniStatus_t infiniopCreateGeluTanhDescriptor(infiniopHandle_t handle,
                                                             infiniopGeluTanhDescriptor_t *desc_ptr,
                                                             infiniopTensorDescriptor_t y,
                                                             infiniopTensorDescriptor_t x);

// Query the workspace size (in bytes) required by infiniopGeluTanh.
__C __export infiniStatus_t infiniopGetGeluTanhWorkspaceSize(infiniopGeluTanhDescriptor_t desc,
                                                             size_t *size);

// Launch the GELU-Tanh operator on the given stream, reading `x` and writing `y`.
__C __export infiniStatus_t infiniopGeluTanh(infiniopGeluTanhDescriptor_t desc,
                                             void *workspace,
                                             size_t workspace_size,
                                             void *y,
                                             const void *x,
                                             void *stream);

// Destroy the descriptor and release its resources.
__C __export infiniStatus_t infiniopDestroyGeluTanhDescriptor(infiniopGeluTanhDescriptor_t desc);

#endif // __INFINIOP_GELUTANH_API_H__
34 changes: 34 additions & 0 deletions include/infiniop/ops/layer_norm.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#ifndef __INFINIOP_LAYER_NORM_API_H__
#define __INFINIOP_LAYER_NORM_API_H__

#include "../operator_descriptor.h"

typedef struct InfiniopDescriptor *infiniopLayerNormDescriptor_t;

// Create a LayerNorm descriptor.
//
// `eps` is the epsilon term (conventionally added for numerical stability —
// confirm exact placement against the implementation).
// NOTE(review): `input_standardization` and `input_std_deviation` are passed
// to infiniopLayerNorm as non-const pointers, so they appear to be auxiliary
// outputs (saved intermediates) rather than inputs — confirm.
__C __export infiniStatus_t infiniopCreateLayerNormDescriptor(infiniopHandle_t handle,
                                                              infiniopLayerNormDescriptor_t *desc_ptr,
                                                              infiniopTensorDescriptor_t output_desc,
                                                              infiniopTensorDescriptor_t input_standardization_desc,
                                                              infiniopTensorDescriptor_t input_std_deviation_desc,
                                                              infiniopTensorDescriptor_t input_desc,
                                                              infiniopTensorDescriptor_t weight_desc,
                                                              infiniopTensorDescriptor_t bias_desc,
                                                              float eps);

// Query the workspace size (in bytes) required by infiniopLayerNorm.
__C __export infiniStatus_t infiniopGetLayerNormWorkspaceSize(infiniopLayerNormDescriptor_t desc, size_t *size);

// Launch the LayerNorm operator on the given stream. `input`, `weight` and
// `bias` are read-only; the remaining tensor pointers are writable.
__C __export infiniStatus_t infiniopLayerNorm(infiniopLayerNormDescriptor_t desc,
                                              void *workspace,
                                              size_t workspace_size,
                                              void *output,
                                              void *input_standardization,
                                              void *input_std_deviation,
                                              const void *input,
                                              const void *weight,
                                              const void *bias,
                                              void *stream);

// Destroy the descriptor and release its resources.
__C __export infiniStatus_t infiniopDestroyLayerNormDescriptor(infiniopLayerNormDescriptor_t desc);

#endif // __INFINIOP_LAYER_NORM_API_H__
42 changes: 42 additions & 0 deletions include/infiniop/ops/quickgelu.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#ifndef __INFINIOP_QUICKGELU_API_H__
#define __INFINIOP_QUICKGELU_API_H__

#include "../operator_descriptor.h"

typedef struct InfiniopDescriptor *infiniopQuickGeluDescriptor_t;

// Create a QuickGELU descriptor:
//   y = x * sigmoid(1.702 * x)
__C __export infiniStatus_t infiniopCreateQuickGeluDescriptor(infiniopHandle_t handle,
                                                              infiniopQuickGeluDescriptor_t *desc_ptr,
                                                              infiniopTensorDescriptor_t y,
                                                              infiniopTensorDescriptor_t x);

// Query the workspace size (in bytes) required by infiniopQuickGelu.
__C __export infiniStatus_t infiniopGetQuickGeluWorkspaceSize(infiniopQuickGeluDescriptor_t desc,
                                                              size_t *size);

// Launch the QuickGELU operator on the given stream, reading `x` and writing `y`.
__C __export infiniStatus_t infiniopQuickGelu(infiniopQuickGeluDescriptor_t desc,
                                              void *workspace,
                                              size_t workspace_size,
                                              void *y,
                                              const void *x,
                                              void *stream);

// Destroy the descriptor and release its resources.
__C __export infiniStatus_t infiniopDestroyQuickGeluDescriptor(infiniopQuickGeluDescriptor_t desc);

#endif // __INFINIOP_QUICKGELU_API_H__
2 changes: 2 additions & 0 deletions include/infiniop/ops/relu.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ __C __export infiniStatus_t infiniopCreateReluDescriptor(infiniopHandle_t handle
infiniopTensorDescriptor_t y,
infiniopTensorDescriptor_t x);

__C infiniStatus_t infiniopGetReluWorkspaceSize(infiniopReluDescriptor_t desc, size_t *size);

__C __export infiniStatus_t infiniopRelu(infiniopReluDescriptor_t desc,
void *workspace,
size_t workspace_size,
Expand Down
24 changes: 24 additions & 0 deletions include/infiniop/ops/sigmoid.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#ifndef __INFINIOP_SIGMOID_API_H__
#define __INFINIOP_SIGMOID_API_H__

#include "../operator_descriptor.h"

typedef struct InfiniopDescriptor *infiniopSigmoidDescriptor_t;

/** Create a Sigmoid operator descriptor for the given output/input pair. */
__C __export infiniStatus_t infiniopCreateSigmoidDescriptor(
    infiniopHandle_t handle,
    infiniopSigmoidDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t y,
    infiniopTensorDescriptor_t x);

/** Query the workspace size (in bytes) required by infiniopSigmoid. */
__C __export infiniStatus_t infiniopGetSigmoidWorkspaceSize(
    infiniopSigmoidDescriptor_t desc,
    size_t *size);

/** Launch the Sigmoid operator on the given stream, reading `x` and writing `y`. */
__C __export infiniStatus_t infiniopSigmoid(
    infiniopSigmoidDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    void *stream);

/** Destroy the descriptor and release its resources. */
__C __export infiniStatus_t infiniopDestroySigmoidDescriptor(
    infiniopSigmoidDescriptor_t desc);

#endif // __INFINIOP_SIGMOID_API_H__
27 changes: 27 additions & 0 deletions include/infiniop/ops/softmax.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#ifndef __INFINIOP_SOFTMAX_API_H__
#define __INFINIOP_SOFTMAX_API_H__

#include "../operator_descriptor.h"

typedef struct InfiniopDescriptor *infiniopSoftmaxDescriptor_t;

// Create a Softmax descriptor; `axis` selects the dimension the softmax is
// taken over (exact axis semantics defined by the implementation).
__C __export infiniStatus_t infiniopCreateSoftmaxDescriptor(infiniopHandle_t handle,
                                                            infiniopSoftmaxDescriptor_t *desc_ptr,
                                                            infiniopTensorDescriptor_t y_desc,
                                                            infiniopTensorDescriptor_t x_desc,
                                                            int axis);

// Query the workspace size (in bytes) required by infiniopSoftmax.
__C __export infiniStatus_t infiniopGetSoftmaxWorkspaceSize(infiniopSoftmaxDescriptor_t desc, size_t *size);

// Launch the Softmax operator on the given stream, reading `x` and writing `y`.
__C __export infiniStatus_t infiniopSoftmax(infiniopSoftmaxDescriptor_t desc,
                                            void *workspace,
                                            size_t workspace_size,
                                            void *y,
                                            const void *x,
                                            void *stream);

// Destroy the descriptor and release its resources.
__C __export infiniStatus_t infiniopDestroySoftmaxDescriptor(infiniopSoftmaxDescriptor_t desc);

#endif // __INFINIOP_SOFTMAX_API_H__
24 changes: 24 additions & 0 deletions include/infiniop/ops/tanh.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#ifndef __INFINIOP_TANH_API_H__
#define __INFINIOP_TANH_API_H__

#include "../operator_descriptor.h"

typedef struct InfiniopDescriptor *infiniopTanhDescriptor_t;

/** Create a Tanh operator descriptor for the given output/input pair. */
__C __export infiniStatus_t infiniopCreateTanhDescriptor(
    infiniopHandle_t handle,
    infiniopTanhDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t output,
    infiniopTensorDescriptor_t input);

/** Query the workspace size (in bytes) required by infiniopTanh. */
__C __export infiniStatus_t infiniopGetTanhWorkspaceSize(
    infiniopTanhDescriptor_t desc,
    size_t *size);

/** Launch the Tanh operator on the given stream, reading `input` and writing `output`. */
__C __export infiniStatus_t infiniopTanh(
    infiniopTanhDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *output,
    const void *input,
    void *stream);

/** Destroy the descriptor and release its resources. */
__C __export infiniStatus_t infiniopDestroyTanhDescriptor(
    infiniopTanhDescriptor_t desc);

#endif // __INFINIOP_TANH_API_H__
8 changes: 8 additions & 0 deletions src/infiniop/ops/add/moore/add_moore.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#ifndef __ADD_MOORE_API_H__
#define __ADD_MOORE_API_H__

#include "../../../elementwise/moore/elementwise_moore_api.h"

// Expands to the op::add::moore::Descriptor declaration via the shared
// elementwise-operator boilerplate macro (defined in the header above);
// the members are implemented in add_moore.mu.
ELEMENTWISE_DESCRIPTOR(add, moore)

#endif // __ADD_MOORE_API_H__
66 changes: 66 additions & 0 deletions src/infiniop/ops/add/moore/add_moore.mu
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#include "add_moore.h"

#include "../../../elementwise/moore/elementwise_moore.h"

#include "add_moore_kernel.h"

namespace op::add::moore {

Descriptor::~Descriptor() = default;

/**
 * Create an elementwise Add descriptor for the MUSA (Moore) backend.
 *
 * Validates that both inputs and the output share the same shape (no
 * broadcasting) and that the output dtype is supported, then builds the
 * shared elementwise descriptor via CREATE_ELEMENTWISE_MOORE_DESCRIPTOR.
 *
 * @param handle_        backend handle; reinterpreted as device::moore::Handle
 * @param desc_ptr       receives the newly created descriptor
 * @param out_desc       descriptor of the output tensor
 * @param input_desc_vec descriptors of the two input tensors {a, b}
 */
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {

    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &a_desc = input_desc_vec.at(0);
    const auto &b_desc = input_desc_vec.at(1);
    const auto &c_shape = out_desc->shape();
    const auto &a_shape = a_desc->shape();
    const auto &b_shape = b_desc->shape();

    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, INFINI_DTYPE_I32, INFINI_DTYPE_I64);

    // Elementwise add requires identical shapes for a, b and the output.
    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);

    // create MOORE elementwise descriptor
    CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)

    return INFINI_STATUS_SUCCESS;
}

/**
 * Launch the Add computation: output = inputs[0] + inputs[1].
 *
 * Dispatches on the dtype captured at creation time; 256 is the thread-block
 * size handed to the shared elementwise launcher.
 *
 * @return INFINI_STATUS_INSUFFICIENT_WORKSPACE if the caller-provided
 *         workspace is smaller than what was reported at creation,
 *         INFINI_STATUS_BAD_TENSOR_DTYPE for unsupported dtypes,
 *         otherwise the launcher's status.
 */
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {

    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    // Every case (including default) returns, so the switch is exhaustive;
    // the unreachable trailing `return INFINI_STATUS_SUCCESS;` was removed.
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<256, moore::AddOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<256, moore::AddOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<256, moore::AddOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<256, moore::AddOp, double>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I32:
        return _device_info->calculate<256, moore::AddOp, int32_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I64:
        return _device_info->calculate<256, moore::AddOp, int64_t>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
} // namespace op::add::moore
38 changes: 38 additions & 0 deletions src/infiniop/ops/add/moore/add_moore_kernel.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#ifndef __ADD_MOORE_KERNEL_H__
#define __ADD_MOORE_KERNEL_H__

/*
 * Elementwise Add functor for the MUSA (Moore) backend.
 *
 * It lives in the 'op::add::moore' namespace (the original comment incorrectly
 * said 'op::add::cuda') but mirrors the structure and interface of the CUDA
 * implementation, keeping the code aligned across hardware platforms.
 */

namespace op::add::moore {
typedef struct AddOp {
public:
    // Number of operands the elementwise launcher feeds to operator().
    static constexpr size_t num_inputs = 2;

    // Returns a + b, selecting the intrinsic appropriate for T.
    template <typename T>
    __device__ __forceinline__ T operator()(const T &a, const T &b) const {
        if constexpr (std::is_same_v<T, half2>) {
            // Packed fp16x2 add.
            return __hadd2(a, b);
        } else if constexpr (std::is_same_v<T, half>) {
            return __hadd(a, b);
        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
            // On MUSA platform, convert to float, add, then convert back to avoid ambiguous conversion
            // from int (returned by __hadd) to __mt_bfloat16
            float a_f = __bfloat162float(a);
            float b_f = __bfloat162float(b);
            return __float2bfloat16_rn(a_f + b_f);
        } else if constexpr (std::is_same_v<T, float>) {
            // Use __fadd_rn instead of __fadd_rd for moore platform compatibility
            return __fadd_rn(a, b);
        } else {
            // Remaining types (double, int32_t, int64_t): plain operator+.
            return a + b;
        }
    }
} AddOp;
} // namespace op::add::moore

#endif // __ADD_MOORE_KERNEL_H__
Loading