Sxy-17
diff --git a/‎src/infiniop/ops/add/moore/add_moore.h‎
Lines changed: 8 additions & 0 deletions b/‎src/infiniop/ops/add/moore/add_moore.h‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎src/infiniop/ops/add/moore/add_moore.mu‎
Lines changed: 66 additions & 0 deletions b/‎src/infiniop/ops/add/moore/add_moore.mu‎
Lines changed: 66 additions & 0 deletions
diff --git a/‎src/infiniop/ops/add/moore/add_moore_kernel.h‎
Lines changed: 38 additions & 0 deletions b/‎src/infiniop/ops/add/moore/add_moore_kernel.h‎
Lines changed: 38 additions & 0 deletions
diff --git a/‎src/infiniop/ops/add/operator.cc‎
Lines changed: 15 additions & 0 deletions b/‎src/infiniop/ops/add/operator.cc‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎src/infiniop/ops/conv/moore/conv_moore.h‎
Lines changed: 85 additions & 0 deletions b/‎src/infiniop/ops/conv/moore/conv_moore.h‎
Lines changed: 85 additions & 0 deletions
diff --git a/‎src/infiniop/ops/conv/moore/conv_mudnn.h‎
Lines changed: 8 additions & 0 deletions b/‎src/infiniop/ops/conv/moore/conv_mudnn.h‎
Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+#ifndef __ADD_MOORE_API_H__
+#define __ADD_MOORE_API_H__
+
+#include "../../../elementwise/moore/elementwise_moore_api.h"
+
+ELEMENTWISE_DESCRIPTOR(add, moore) // presumably expands to the op::add::moore::Descriptor declaration that add_moore.mu implements — confirm in elementwise_moore_api.h
+
+#endif // __ADD_MOORE_API_H__
@@ -0,0 +1,66 @@
+#include "add_moore.h"
+
+#include "../../../elementwise/moore/elementwise_moore.h"
+
+#include "add_moore_kernel.h"
+
+namespace op::add::moore {
+
+Descriptor::~Descriptor() = default; // compiler-generated destructor, defined out of line in this translation unit
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+    // Validates the output dtype and the operand shapes, then hands off to the elementwise descriptor macro.
+    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+    // Inputs are read positionally: index 0 is a, index 1 is b; at() throws if fewer than two are supplied.
+    const auto &a_desc = input_desc_vec.at(0);
+    const auto &b_desc = input_desc_vec.at(1);
+    const auto &c_shape = out_desc->shape();
+    const auto &a_shape = a_desc->shape();
+    const auto &b_shape = b_desc->shape();
+    // Only the output dtype is checked here; the accepted list matches the cases handled by calculate().
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, INFINI_DTYPE_I32, INFINI_DTYPE_I64);
+    // The output shape and both input shapes go through the same-shape check together.
+    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
+
+    // Create the MOORE elementwise descriptor from the validated handle, dtype and tensor descriptors.
+    CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+    // NOTE(review): presumably the macro stores the new descriptor through desc_ptr and returns early on failure — confirm.
+    return INFINI_STATUS_SUCCESS;
+}
+
+// Dispatches the add on _dtype to _device_info->calculate<256, moore::AddOp, T> and returns that call's status.
+// Fails with INFINI_STATUS_INSUFFICIENT_WORKSPACE when workspace_size < _workspace_size, and with INFINI_STATUS_BAD_TENSOR_DTYPE for any other dtype.
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, moore::AddOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, moore::AddOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, moore::AddOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, moore::AddOp, double>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_I32:
+        return _device_info->calculate<256, moore::AddOp, int32_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_I64:
+        return _device_info->calculate<256, moore::AddOp, int64_t>(_info, workspace, output, inputs, stream);
+    default: // every arm returns, so nothing after the switch is reachable
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+} // namespace op::add::moore
@@ -0,0 +1,38 @@
+#ifndef __ADD_MOORE_KERNEL_H__
+#define __ADD_MOORE_KERNEL_H__
+
+/*
+ * This file contains the Add operation implementation for the MUSA backend.
+ *
+ * The functor lives in the 'op::add::moore' namespace and keeps the same code structure
+ * and interface as the CUDA implementation, so the code stays aligned across different
+ * hardware platforms.
+ */
+
+namespace op::add::moore {
+typedef struct AddOp { // stateless functor: operator() returns the sum of two values of the same type T
+public:
+    static constexpr size_t num_inputs = 2; // the operator takes two operands, a and b
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &a, const T &b) const {
+        if constexpr (std::is_same_v<T, half2>) { // the branch is chosen at compile time from T
+            return __hadd2(a, b);
+        } else if constexpr (std::is_same_v<T, half>) {
+            return __hadd(a, b);
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            // On the MUSA platform, convert to float, add, then convert back: this avoids the ambiguous
+            // conversion from int (returned by __hadd) to __mt_bfloat16.
+            float a_f = __bfloat162float(a);
+            float b_f = __bfloat162float(b);
+            return __float2bfloat16_rn(a_f + b_f);
+        } else if constexpr (std::is_same_v<T, float>) {
+            // Use __fadd_rn instead of __fadd_rd for moore platform compatibility
+            return __fadd_rn(a, b);
+        } else { // any other T falls back to T's own operator+
+            return a + b;
+        }
+    }
+} AddOp;
+} // namespace op::add::moore
+
+#endif // __ADD_MOORE_KERNEL_H__
@@ -18,6 +18,9 @@
 #ifdef ENABLE_CAMBRICON_API
 #include "bang/add_bang.h"
 #endif
+#ifdef ENABLE_MOORE_API
+#include "moore/add_moore.h"
+#endif
 
 __C infiniStatus_t infiniopCreateAddDescriptor(
     infiniopHandle_t handle,
@@ -58,6 +61,9 @@ __C infiniStatus_t infiniopCreateAddDescriptor(
 #ifdef ENABLE_CAMBRICON_API
         CREATE(INFINI_DEVICE_CAMBRICON, bang);
 #endif
+#ifdef ENABLE_MOORE_API
+        CREATE(INFINI_DEVICE_MOORE, moore);
+#endif
 
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -94,6 +100,9 @@ __C infiniStatus_t infiniopGetAddWorkspaceSize(infiniopAddDescriptor_t desc, siz
 #endif
 #ifdef ENABLE_CAMBRICON_API
         GET(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+#ifdef ENABLE_MOORE_API
+        GET(INFINI_DEVICE_MOORE, moore);
 #endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -140,6 +149,9 @@ __C infiniStatus_t infiniopAdd(
 #ifdef ENABLE_CAMBRICON_API
         CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
 #endif
+#ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore);
+#endif
 
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -179,6 +191,9 @@ infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc) {
 #ifdef ENABLE_CAMBRICON_API
         DELETE(INFINI_DEVICE_CAMBRICON, bang);
 #endif
+#ifdef ENABLE_MOORE_API
+        DELETE(INFINI_DEVICE_MOORE, moore);
+#endif
 
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
 
@@ -0,0 +1,85 @@
+#ifndef __CONV_MOORE_H__
+#define __CONV_MOORE_H__
+
+#include "conv_mudnn.h"
+
+namespace op::conv::moore {
+
+// Descriptor class for CONV operations on Moore devices.
+// This class acts as a wrapper to select mudnn backend.
+// It encapsulates the backend-specific Descriptor implementation and provides
+// a unified interface for workspace query and CONV calculation.
+class Descriptor final : public InfiniopDescriptor {
+public:
+    // Destructor: deletes the backend-specific descriptor (deleting a null _impl is a no-op).
+    ~Descriptor() {
+        delete static_cast<mudnn::Descriptor *>(_impl);
+    }
+    // Non-copyable: _impl is owned through a raw pointer, so a copy would delete it twice.
+    Descriptor(const Descriptor &) = delete;
+    Descriptor &operator=(const Descriptor &) = delete;
+
+    // Returns the required workspace size for the CONV operation.
+    size_t workspaceSize() const {
+        return static_cast<mudnn::Descriptor *>(_impl)->workspaceSize();
+    }
+
+    // Static factory method to create a Descriptor instance.
+    // This method chooses the backend (mudnn) and constructs
+    // the corresponding implementation internally.
+    static infiniStatus_t create(
+        infiniopHandle_t handle,
+        Descriptor **desc_ptr,
+        infiniopTensorDescriptor_t y_desc,
+        infiniopTensorDescriptor_t x_desc,
+        infiniopTensorDescriptor_t w_desc,
+        infiniopTensorDescriptor_t b_desc,
+        const void *pads,
+        const void *strides,
+        const void *dilations,
+        size_t n) {
+        auto desc = new Descriptor(handle->device, handle->device_id);
+        // Backend selection strategy: currently defaulting to MUDNN.
+        // Can be modified to choose based on environment variables or runtime parameters.
+        desc->_backend = Backend::MUDNN;
+
+        mudnn::Descriptor *impl = nullptr;
+        auto status = mudnn::Descriptor::create(handle, &impl, y_desc, x_desc, w_desc, b_desc, pads, strides, dilations, n);
+        if (status != INFINI_STATUS_SUCCESS) {
+            delete desc; // _impl is still null here, so only the wrapper itself is freed
+            return status;
+        }
+        desc->_impl = impl;
+
+        *desc_ptr = desc;
+        return INFINI_STATUS_SUCCESS;
+    }
+
+    // Unified CONV calculation interface.
+    // Calls the corresponding backend's calculate function internally.
+    infiniStatus_t calculate(
+        void *workspace, size_t workspace_size,
+        void *y,
+        const void *x,
+        const void *w,
+        const void *bias,
+        void *stream) const {
+        return static_cast<mudnn::Descriptor *>(_impl)
+            ->calculate(workspace, workspace_size, y, x, w, bias, stream);
+    }
+
+private:
+    // Private constructor: ensures users cannot directly instantiate Descriptor.
+    // Instances must be created via the static create() factory method.
+    Descriptor(infiniDevice_t device_type, int device_id)
+        : InfiniopDescriptor{device_type, device_id}, _backend(Backend::MUDNN), _impl(nullptr) {}
+
+    // Enum to indicate which backend is being used internally.
+    enum class Backend { MUDNN };
+
+    Backend _backend; // Currently selected backend; initialised in the constructor so it is never indeterminate
+    void *_impl;      // Pointer to backend-specific descriptor (mudnn::Descriptor*)
+};
+
+} // namespace op::conv::moore
+
+#endif // __CONV_MOORE_H__
@@ -0,0 +1,8 @@
+#ifndef __CONV_MUDNN_H__
+#define __CONV_MUDNN_H__
+
+#include "../conv.h"
+
+DESCRIPTOR(mudnn) // presumably declares op::conv::mudnn::Descriptor, the type conv_moore.h wraps — confirm in ../conv.h
+
+#endif // __CONV_MUDNN_H__