// This header file will only be included by .xpu files
55#include " xpu/runtime.h"
66#include < xpu/kernel/xtdk.h>
7+ #include < xpu/kernel/xtdk_atomic_sm_xpu3.h>
78#include < xpu/kernel/xtdk_bf16.h>
89#include < xpu/kernel/xtdk_math.h>
910#include < xpu/kernel/xtdk_simd.h>
11+ #include < xpu/kernel/xtdk_trigonometric.h>
1012
1113namespace device ::kunlun::kernel {
1214
15+ #define SM_SIZE 10240
16+
/**
 * @brief ptrdiff_t replacement for Kunlun XPU kernels.
 *
 * On the XPU kernel side ptrdiff_t is 32 bit; the struct pads the
 * 32-bit value out to 64 bit so it matches the host-side layout and is
 * convenient to move with DATACOPY.
 */
typedef struct _ptrdiff_t {
    int32_t value;   // the actual 32-bit offset
    int32_t padding; // padding up to 64 bit; content unused
} _ptrdiff_t;
17-
1826// same as ptrdiff
1927typedef struct _size_t {
2028 uint32_t value;
@@ -29,17 +37,83 @@ inline __device__ float lowerBitMask(int i) {
2937 return (1 << (i + 1 )) - 1 ;
3038}
3139
32- // Atomic add for reduce
33- inline __device__ void atomicAddF32 (__shared_ptr__ float *ptr, float value) {
34- int success = 1 ;
35- while (success) {
36- // SM2REG read 32bit data to register
37- float a = SM2REG_atomic (ptr);
38- a = a + value;
39- success = REG2SM_atomic (ptr, a);
40+ /* *
41+ * @brief Load data from shared memory
42+ * @param p: pointer to shared memory
43+ * @return loaded value
44+ */
45+ template <typename T>
46+ __device__ inline T loadsm (__shared_ptr__ const T *p) {
47+ T v;
48+ if constexpr (std::is_same<T, half>::value
49+ || std::is_same<T, bfloat16_t >::value) {
50+ __builtin_memcpy (&v, p, sizeof (T));
51+ } else {
52+ v = *p;
53+ }
54+ return v;
55+ }
56+ // Load len data from shared memory
57+ template <typename T>
58+ __device__ inline void loadsm (__shared_ptr__ const T *p, T *v, int len) {
59+ __builtin_memcpy (v, p, len * sizeof (T));
60+ }
61+
62+ /* *
63+ * @brief Convert data type. All data is in local memory
64+ * @param v: input value
65+ * @return output value
66+ */
67+ template <typename Tout, typename Tin>
68+ __device__ inline Tout to (Tin v) {
69+ if constexpr (std::is_same<Tin, half>::value) {
70+ return __half2float (v);
71+ } else if constexpr (std::is_same<Tin, bfloat16_t >::value) {
72+ return __bfloat162float (v);
73+ } else {
74+ return static_cast <Tout>(v);
4075 }
4176}
4277
78+ /* *
79+ * @brief atomicAdd for kunlun xpu
80+ * @param ptr: pointer to shared memory
81+ * @param value: value to add
82+ */
83+ template <typename T>
84+ inline __device__ T atomicAdd (__shared_ptr__ T *ptr, T value) {
85+ T x = atomicadd (ptr, value);
86+ return x;
87+ }
88+ // Specialize atomicAdd for half
89+ template <>
90+ inline __device__ half atomicAdd<half>(__shared_ptr__ half *ptr, half value) {
91+ ticket_lock_mix ();
92+ __half old = loadsm (ptr);
93+ float of = __half2float (old);
94+ float vf = __half2float (value);
95+ float sumf = of + vf;
96+ half sum = __float2half_rn (sumf);
97+ *ptr = sum;
98+ mfence_sm ();
99+ ticket_unlock_mix ();
100+ return old;
101+ }
102+ // Specialize atomicAdd for bfloat16_t
103+ template <>
104+ inline __device__ bfloat16_t atomicAdd<bfloat16_t >(__shared_ptr__ bfloat16_t *ptr, bfloat16_t value) {
105+ ticket_lock_mix ();
106+ bfloat16_t old = loadsm (ptr);
107+ float of = __bfloat162float (old);
108+ float vf = __bfloat162float (value);
109+ float sumf = of + vf;
110+ bfloat16_t sum = __float2bfloat16_rn (sumf);
111+ *ptr = sum;
112+ mfence_sm ();
113+ ticket_unlock_mix ();
114+ return old;
115+ }
116+
43117/* *
44118 * @brief Get index of broadcasted input
45119 * flat_index: flatten index of output tensor
@@ -85,5 +159,3 @@ inline __device__ int indexToOffset(
85159} // namespace device::kunlun::kernel
86160
87161#endif // __INFINIOP_KUNLUN_KERNEL_COMMON_H__
88- // TODO: atomicAddF16
89- // TODO: atomicAddI8
0 commit comments