
Commit aa1c37c

Merge branch 'InfiniTensor:main' into T1-3-1
2 parents: 4dde162 + e20c000


44 files changed: +3080 −358 lines
Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
#include "infiniccl_cambricon.h"

#include "../../utils.h"
#include <cncl.h>
#include <cnrt.h>
#include <iostream>
#include <vector>

#define CHECK_CNCL(API__) CHECK_INTERNAL(API__, CNCL_RET_SUCCESS)

inline cnrtQueue_t getCambriconStream(infinirtStream_t stream) {
    if (stream == nullptr) {
        return (cnrtQueue_t)(0);
    }
    return static_cast<cnrtQueue_t>(stream);
}

inline cnclComm_t getCnclComm(infinicclComm_t comm) {
    return static_cast<cnclComm_t>(comm->comm);
}

inline cnclDataType_t getCnclDtype(infiniDtype_t datatype) {
    switch (datatype) {
    case INFINI_DTYPE_F32:
        return cnclFloat32;
    case INFINI_DTYPE_F16:
        return cnclFloat16;
    default:
        std::cerr << "Unsupported data type: " << datatype << std::endl;
        std::abort();
        return cnclFloat16;
    }
}

inline cnclReduceOp_t getCnclRedOp(infinicclReduceOp_t op) {
    switch (op) {
    case INFINICCL_SUM:
        return cnclSum;
    case INFINICCL_PROD:
        return cnclProd;
    case INFINICCL_MAX:
        return cnclMax;
    case INFINICCL_MIN:
        return cnclMin;
    default:
        std::abort();
        return cnclSum;
    }
}

namespace infiniccl::cambricon {

infiniStatus_t commInitAll(
    infinicclComm_t *comms,
    int ndevice,
    const int *device_ids) {

    std::vector<cnclComm_t> cncl_comms(ndevice);
    std::vector<int> rank_list(ndevice);

    for (int i = 0; i < ndevice; i++) {
        rank_list[i] = i;
        CHECK_INTERNAL(cnrtSetDevice(device_ids[i]), CNRT_RET_SUCCESS);
    }

    CHECK_CNCL(cnclInitComms(cncl_comms.data(), ndevice,
                             (int const *)device_ids, rank_list.data(),
                             ndevice, nullptr));

    for (int i = 0; i < ndevice; i++) {
        comms[i] = new InfinicclComm{INFINI_DEVICE_CAMBRICON, device_ids[i], (void *)(cncl_comms[i])};
    }

    return INFINI_STATUS_SUCCESS;
}

infiniStatus_t commDestroy(infinicclComm_t comm) {
    CHECK_CNCL(cnclFreeComm(getCnclComm(comm)));
    delete comm;
    return INFINI_STATUS_SUCCESS;
}

infiniStatus_t allReduce(
    void *sendbuf,
    void *recvbuf,
    size_t count,
    infiniDtype_t datatype,
    infinicclReduceOp_t op,
    infinicclComm_t comm,
    infinirtStream_t stream) {

    if (datatype != INFINI_DTYPE_F32 && datatype != INFINI_DTYPE_F16) {
        return INFINI_STATUS_BAD_PARAM;
    }

    CHECK_CNCL(cnclAllReduce(sendbuf, recvbuf, count, getCnclDtype(datatype),
                             getCnclRedOp(op), getCnclComm(comm),
                             getCambriconStream(stream)));

    return INFINI_STATUS_SUCCESS;
}
} // namespace infiniccl::cambricon
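For orientation, the fragment below is a hypothetical host-side harness (not part of the commit) showing how the three entry points above chain together for a single-process, two-device all-reduce; buffer allocation and stream creation are assumed to have been done elsewhere with the usual CNRT/infinirt calls.

// Hypothetical usage sketch: sum-reduce a float32 buffer with the Cambricon backend.
infiniStatus_t demoAllReduce(void *sendbuf, void *recvbuf, size_t count,
                             infinirtStream_t stream) {
    const int device_ids[2] = {0, 1};
    infinicclComm_t comms[2];

    // One CNCL communicator per device, ranks 0..ndevice-1.
    auto status = infiniccl::cambricon::commInitAll(comms, 2, device_ids);
    if (status != INFINI_STATUS_SUCCESS) {
        return status;
    }

    // Issue the reduction on device 0's communicator; in a real single-process
    // multi-device setup the matching call is issued for each communicator.
    status = infiniccl::cambricon::allReduce(sendbuf, recvbuf, count,
                                             INFINI_DTYPE_F32, INFINICCL_SUM,
                                             comms[0], stream);

    for (int i = 0; i < 2; i++) {
        infiniccl::cambricon::commDestroy(comms[i]);
    }
    return status;
}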
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
#ifndef INFINICCL_CAMBRICON_H_
#define INFINICCL_CAMBRICON_H_

#include "../infiniccl_impl.h"

#if defined(ENABLE_CAMBRICON_API) && defined(ENABLE_CCL)
INFINICCL_DEVICE_API_IMPL(cambricon)
#else
INFINICCL_DEVICE_API_NOOP(cambricon)
#endif

#endif /* INFINICCL_CAMBRICON_H_ */
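The INFINICCL_DEVICE_API_IMPL / INFINICCL_DEVICE_API_NOOP macros come from ../infiniccl_impl.h, which is not included in this diff. Judging from the definitions in the source file above and the dispatch in infiniccl.cc, a plausible, purely illustrative expansion of the IMPL variant is a set of per-backend declarations:

// Illustrative guess only -- the real macro lives in infiniccl_impl.h and may differ.
namespace infiniccl::cambricon {
infiniStatus_t commInitAll(infinicclComm_t *comms, int ndevice, const int *device_ids);
infiniStatus_t commDestroy(infinicclComm_t comm);
infiniStatus_t allReduce(void *sendbuf, void *recvbuf, size_t count,
                         infiniDtype_t datatype, infinicclReduceOp_t op,
                         infinicclComm_t comm, infinirtStream_t stream);
} // namespace infiniccl::cambricon

The NOOP variant presumably provides stubs for the same three functions when the Cambricon backend is compiled out.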

src/infiniccl/infiniccl.cc

Lines changed: 4 additions & 0 deletions
@@ -1,6 +1,7 @@
 #include "infiniccl.h"
 
 #include "./ascend/infiniccl_ascend.h"
+#include "./cambricon/infiniccl_cambricon.h"
 #include "./cuda/infiniccl_cuda.h"
 #include "./metax/infiniccl_metax.h"
 
@@ -18,6 +19,7 @@ __C infiniStatus_t infinicclCommInitAll(
     COMM_INIT_ALL(INFINI_DEVICE_NVIDIA, cuda);
     COMM_INIT_ALL(INFINI_DEVICE_ILUVATAR, cuda);
     COMM_INIT_ALL(INFINI_DEVICE_ASCEND, ascend);
+    COMM_INIT_ALL(INFINI_DEVICE_CAMBRICON, cambricon);
     COMM_INIT_ALL(INFINI_DEVICE_METAX, metax);
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -39,6 +41,7 @@ __C infiniStatus_t infinicclCommDestroy(infinicclComm_t comm) {
     COMM_DESTROY(INFINI_DEVICE_NVIDIA, cuda);
     COMM_DESTROY(INFINI_DEVICE_ILUVATAR, cuda);
     COMM_DESTROY(INFINI_DEVICE_ASCEND, ascend);
+    COMM_DESTROY(INFINI_DEVICE_CAMBRICON, cambricon);
     COMM_DESTROY(INFINI_DEVICE_METAX, metax);
 
     default:
@@ -68,6 +71,7 @@ __C infiniStatus_t infinicclAllReduce(
     ALL_REDUCE(INFINI_DEVICE_NVIDIA, cuda);
     ALL_REDUCE(INFINI_DEVICE_ILUVATAR, cuda);
     ALL_REDUCE(INFINI_DEVICE_ASCEND, ascend);
+    ALL_REDUCE(INFINI_DEVICE_CAMBRICON, cambricon);
     ALL_REDUCE(INFINI_DEVICE_METAX, metax);
 
     default:
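The COMM_INIT_ALL, COMM_DESTROY and ALL_REDUCE macros are defined earlier in infiniccl.cc and are not shown in these hunks. Since each use sits inside a switch over the device type, a plausible (purely illustrative) shape for one of them is a case label that forwards to the backend namespace:

// Illustrative only -- the actual macro definition is outside this diff.
#define COMM_INIT_ALL(CASE, NAMESPACE) \
    case CASE:                         \
        return infiniccl::NAMESPACE::commInitAll(comms, ndevice, device_ids)

so the three INFINI_DEVICE_CAMBRICON lines added here simply route the public entry points to the infiniccl::cambricon functions introduced above.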

src/infiniop-test/src/main.cpp

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@ struct ParsedArgs {
     int device_id = 0;    // CUDA device ID (if specified)
     int warmups = 0;      // Default to 0 if not given
     int iterations = 0;   // Default to 0 if not given
-    double atol = 0.001;  // Default absolute tolerance
+    double atol = 0.0015; // Default absolute tolerance
     double rtol = 0.001;  // Default relative tolerance
 };
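The only change here raises the default absolute tolerance from 0.001 to 0.0015. Assuming the tester applies the usual mixed absolute/relative comparison (the exact formula is not visible in this diff), the check is roughly:

// Hypothetical sketch of an atol/rtol comparison; the real check in
// infiniop-test may differ in detail.
#include <cmath>

inline bool closeEnough(double actual, double expected, double atol, double rtol) {
    return std::fabs(actual - expected) <= atol + rtol * std::fabs(expected);
}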

src/infiniop/devices/bang/bang_kernel_common.h

Lines changed: 15 additions & 0 deletions
@@ -7,6 +7,21 @@
 
 namespace device::bang::kernel {
 
+template <typename T>
+__mlu_device__ float to_float(const T &v) {
+    return static_cast<float>(v);
+}
+
+template <typename T>
+__mlu_device__ bfloat16_t to_bfloat16(const T &v) {
+    return static_cast<bfloat16_t>(v);
+}
+
+template <typename T>
+__mlu_device__ half to_half(const T &v) {
+    return static_cast<half>(v);
+}
+
 /**
  * @brief Converts a flattened index to a reduced offset considering broadcasting.
  *
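These helpers wrap static_cast behind names usable inside MLU device code. A hypothetical use (not in the commit) is widening low-precision operands to float for arithmetic and narrowing the result back:

// Illustrative only, assuming the to_float/to_half helpers above:
// promote half inputs to float for the multiply-add, then narrow the result.
__mlu_device__ half scaled_add(const half &a, const half &b, float alpha) {
    float acc = to_float(a) + alpha * to_float(b);
    return to_half(acc);
}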

src/infiniop/devices/kunlun/kunlun_kernel_common.h

Lines changed: 83 additions & 11 deletions
@@ -4,17 +4,25 @@
 // This header file will only be include by .xpu file
 #include "xpu/runtime.h"
 #include <xpu/kernel/xtdk.h>
+#include <xpu/kernel/xtdk_atomic_sm_xpu3.h>
 #include <xpu/kernel/xtdk_bf16.h>
 #include <xpu/kernel/xtdk_math.h>
 #include <xpu/kernel/xtdk_simd.h>
+#include <xpu/kernel/xtdk_trigonometric.h>
 
 namespace device::kunlun::kernel {
 
+#define SM_SIZE 10240
+
+/**
+ * @brief Define ptrdiff_t and size_t for the Kunlun XPU.
+ * ptrdiff_t and size_t are 32-bit in an XPU kernel;
+ * we pad them to 64 bits for convenience of DATACOPY.
+ */
 typedef struct _ptrdiff_t {
     int32_t value;   // 32 bit
     int32_t padding; // 32 bit
 } _ptrdiff_t;
-
 // same as ptrdiff
 typedef struct _size_t {
     uint32_t value;
@@ -29,17 +37,83 @@ inline __device__ float lowerBitMask(int i) {
     return (1 << (i + 1)) - 1;
 }
 
-// Atomic add for reduce
-inline __device__ void atomicAddF32(__shared_ptr__ float *ptr, float value) {
-    int success = 1;
-    while (success) {
-        // SM2REG read 32bit data to register
-        float a = SM2REG_atomic(ptr);
-        a = a + value;
-        success = REG2SM_atomic(ptr, a);
+/**
+ * @brief Load data from shared memory
+ * @param p: pointer to shared memory
+ * @return loaded value
+ */
+template <typename T>
+__device__ inline T loadsm(__shared_ptr__ const T *p) {
+    T v;
+    if constexpr (std::is_same<T, half>::value
+                  || std::is_same<T, bfloat16_t>::value) {
+        __builtin_memcpy(&v, p, sizeof(T));
+    } else {
+        v = *p;
+    }
+    return v;
+}
+// Load len data from shared memory
+template <typename T>
+__device__ inline void loadsm(__shared_ptr__ const T *p, T *v, int len) {
+    __builtin_memcpy(v, p, len * sizeof(T));
+}
+
+/**
+ * @brief Convert data type. All data is in local memory
+ * @param v: input value
+ * @return output value
+ */
+template <typename Tout, typename Tin>
+__device__ inline Tout to(Tin v) {
+    if constexpr (std::is_same<Tin, half>::value) {
+        return __half2float(v);
+    } else if constexpr (std::is_same<Tin, bfloat16_t>::value) {
+        return __bfloat162float(v);
+    } else {
+        return static_cast<Tout>(v);
     }
 }
 
+/**
+ * @brief atomicAdd for kunlun xpu
+ * @param ptr: pointer to shared memory
+ * @param value: value to add
+ */
+template <typename T>
+inline __device__ T atomicAdd(__shared_ptr__ T *ptr, T value) {
+    T x = atomicadd(ptr, value);
+    return x;
+}
+// Specialize atomicAdd for half
+template <>
+inline __device__ half atomicAdd<half>(__shared_ptr__ half *ptr, half value) {
+    ticket_lock_mix();
+    __half old = loadsm(ptr);
+    float of = __half2float(old);
+    float vf = __half2float(value);
+    float sumf = of + vf;
+    half sum = __float2half_rn(sumf);
+    *ptr = sum;
+    mfence_sm();
+    ticket_unlock_mix();
+    return old;
+}
+// Specialize atomicAdd for bfloat16_t
+template <>
+inline __device__ bfloat16_t atomicAdd<bfloat16_t>(__shared_ptr__ bfloat16_t *ptr, bfloat16_t value) {
+    ticket_lock_mix();
+    bfloat16_t old = loadsm(ptr);
+    float of = __bfloat162float(old);
+    float vf = __bfloat162float(value);
+    float sumf = of + vf;
+    bfloat16_t sum = __float2bfloat16_rn(sumf);
+    *ptr = sum;
+    mfence_sm();
+    ticket_unlock_mix();
+    return old;
+}
+
 /**
  * @brief Get index of broadcasted input
  * flat_index: flatten index of output tensor
@@ -85,5 +159,3 @@ inline __device__ int indexToOffset(
 } // namespace device::kunlun::kernel
 
 #endif // __INFINIOP_KUNLUN_KERNEL_COMMON_H__
-// TODO: atomicAddF16
-// TODO: atomicAddI8
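With the templated atomicAdd in place (and ticket-lock specializations for half and bfloat16_t), reduction kernels can accumulate partial results into shared memory generically. A hypothetical fragment, not part of the commit:

// Illustrative only: each thread adds its partial value into a shared
// accumulator; half/bfloat16_t take the ticket-lock path above, other types
// map to the hardware atomicadd intrinsic.
template <typename T>
__device__ void accumulatePartial(__shared_ptr__ T *acc, T partial) {
    atomicAdd(acc, partial);
}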

src/infiniop/devices/moore/moore_kernel_common.h

Lines changed: 7 additions & 2 deletions
@@ -11,8 +11,8 @@
 
 #define CHECK_MOORE(API) CHECK_INTERNAL(API, musaSuccess)
 
-using musa_bfloat16 = mt_bfloat16;
-using musa_bfloat162 = mt_bfloat162;
+using cuda_bfloat16 = mt_bfloat16;
+using cuda_bfloat162 = mt_bfloat162;
 
 namespace device::moore {
 
@@ -52,6 +52,11 @@ exp_(const float val) {
     return expf(val);
 }
 
+__forceinline__ __device__ long double
+exp_(const long double val) {
+    return exp(val);
+}
+
 __forceinline__ __device__ double
 exp_(const double val) {
     return exp(val);