Skip to content

Commit 3959c94

Browse files
authored
Issue/472: 接入昆仑芯通信库 (#479)
* issue/472: p800 ccl * issue/472: 删掉无用操作 * issue/472: fix format * issue/472: memcpy h2h case
1 parent 20a2dbd commit 3959c94

File tree

7 files changed

+165
-7
lines changed

7 files changed

+165
-7
lines changed

src/infiniccl-test/infiniccl_test.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ void *testAllReduceThread(void *arg) {
114114
TEST_INFINI_THREAD(infinirtMalloc(&buf, args->count * infiniSizeOf(args->dtype)));
115115
TEST_INFINI_THREAD(infinirtMemcpy(buf, args->data, args->count * infiniSizeOf(args->dtype), INFINIRT_MEMCPY_H2D));
116116
TEST_INFINI_THREAD(infinicclAllReduce(buf, buf, args->count, args->dtype, INFINICCL_SUM, args->comm, stream));
117-
TEST_INFINI_THREAD(infinirtDeviceSynchronize());
117+
TEST_INFINI_THREAD(infinirtStreamSynchronize(stream));
118118
TEST_INFINI_THREAD(infinirtMemcpy(output, buf, args->count * infiniSizeOf(args->dtype), INFINIRT_MEMCPY_D2H));
119119

120120
if (checkData(output, args->ans, args->dtype, args->count) != 0) {
@@ -126,14 +126,14 @@ void *testAllReduceThread(void *arg) {
126126
for (size_t i = 0; i < WARM_UPS; i++) {
127127
TEST_INFINI_THREAD(infinicclAllReduce(buf, buf, args->count, args->dtype, INFINICCL_SUM, args->comm, stream));
128128
}
129-
TEST_INFINI_THREAD(infinirtDeviceSynchronize());
129+
TEST_INFINI_THREAD(infinirtStreamSynchronize(stream));
130130

131131
// measure time
132132
auto start = std::chrono::high_resolution_clock::now();
133133
for (size_t i = 0; i < ITERATIONS; i++) {
134134
TEST_INFINI_THREAD(infinicclAllReduce(buf, buf, args->count, args->dtype, INFINICCL_SUM, args->comm, stream));
135135
}
136-
TEST_INFINI_THREAD(infinirtDeviceSynchronize());
136+
TEST_INFINI_THREAD(infinirtStreamSynchronize(stream));
137137
auto end = std::chrono::high_resolution_clock::now();
138138
double elapsed_ms = std::chrono::duration<double, std::milli>(end - start).count();
139139
*args->time = elapsed_ms / ITERATIONS;
@@ -159,12 +159,12 @@ int testAllReduce(infiniDevice_t device_type, int ndevice) {
159159
for (int i = 0; i < ndevice; i++) {
160160
device_ids[i] = i;
161161
}
162-
TEST_INFINI(infinicclCommInitAll(device_type, comms.data(), ndevice, device_ids.data()));
163162

164163
for (infiniDtype_t dtype : TEST_DTYPES) {
165164
setData(dtype, data, MAX_COUNT, 1.0f);
166165
setData(dtype, ans, MAX_COUNT, 1.0f * ndevice);
167166
for (size_t count : TEST_COUNTS) {
167+
TEST_INFINI(infinicclCommInitAll(device_type, comms.data(), ndevice, device_ids.data()));
168168
std::cout << "Testing AllReduce with " << count << " elements of " << infiniDtypeToString(dtype) << std::endl;
169169
for (int rank = 0; rank < ndevice; rank++) {
170170
thread_args[rank] = {rank, device_ids[rank], comms[rank], device_type, dtype, count, data, ans, &results[rank], &times[rank]};

src/infiniccl/infiniccl.cc

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#include "./ascend/infiniccl_ascend.h"
44
#include "./cambricon/infiniccl_cambricon.h"
55
#include "./cuda/infiniccl_cuda.h"
6+
#include "./kunlun/infiniccl_kunlun.h"
67
#include "./metax/infiniccl_metax.h"
78
#include "./moore/infiniccl_moore.h"
89

@@ -23,6 +24,7 @@ __C infiniStatus_t infinicclCommInitAll(
2324
COMM_INIT_ALL(INFINI_DEVICE_CAMBRICON, cambricon);
2425
COMM_INIT_ALL(INFINI_DEVICE_METAX, metax);
2526
COMM_INIT_ALL(INFINI_DEVICE_MOORE, moore);
27+
COMM_INIT_ALL(INFINI_DEVICE_KUNLUN, kunlun);
2628
default:
2729
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
2830
}
@@ -46,7 +48,7 @@ __C infiniStatus_t infinicclCommDestroy(infinicclComm_t comm) {
4648
COMM_DESTROY(INFINI_DEVICE_CAMBRICON, cambricon);
4749
COMM_DESTROY(INFINI_DEVICE_METAX, metax);
4850
COMM_DESTROY(INFINI_DEVICE_MOORE, moore);
49-
51+
COMM_DESTROY(INFINI_DEVICE_KUNLUN, kunlun);
5052
default:
5153
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
5254
}
@@ -77,6 +79,7 @@ __C infiniStatus_t infinicclAllReduce(
7779
ALL_REDUCE(INFINI_DEVICE_CAMBRICON, cambricon);
7880
ALL_REDUCE(INFINI_DEVICE_METAX, metax);
7981
ALL_REDUCE(INFINI_DEVICE_MOORE, moore);
82+
ALL_REDUCE(INFINI_DEVICE_KUNLUN, kunlun);
8083

8184
default:
8285
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
#include "infiniccl_kunlun.h"
2+
3+
#include "../../utils.h"
4+
5+
#include <bkcl.h>
6+
7+
#include <iostream>
8+
#include <vector>
9+
10+
#define CHECK_BKCL(API__) CHECK_INTERNAL(API__, BKCL_SUCCESS)
11+
12+
typedef XPUStream kunlunStream_t;
13+
typedef BKCLContext_t bkclComm_t;
14+
15+
// Converts the runtime-agnostic stream handle into the XPU stream type that
// the BKCL calls in this file take.
//
// A null handle maps to a zero-valued XPU stream handle; any other handle is
// reinterpreted unchanged.
// NOTE(review): the zero handle presumably selects the default stream — confirm
// against the XPU runtime documentation.
inline kunlunStream_t getKunlunStream(infinirtStream_t stream) {
    return stream == nullptr ? kunlunStream_t{}
                             : reinterpret_cast<kunlunStream_t>(stream);
}
21+
22+
// Returns the BKCL context stored in the `comm` field of an InfiniCCL
// communicator. `comm` is dereferenced unconditionally, so it must not be null.
inline bkclComm_t getBkclComm(infinicclComm_t comm) {
    auto raw_handle = comm->comm;
    return reinterpret_cast<bkclComm_t>(raw_handle);
}
25+
26+
// Maps an InfiniCore data type onto the matching BKCL data type.
//
// Only F32, F16 and BF16 are translated. Any other value is reported on
// stderr and the process is terminated with std::abort().
inline BKCLDataType getBkclDtype(infiniDtype_t datatype) {
    switch (datatype) {
    case INFINI_DTYPE_F32:
        return BKCL_FLOAT;
    case INFINI_DTYPE_F16:
        return BKCL_FLOAT16;
    case INFINI_DTYPE_BF16:
        return BKCL_BFLOAT16;
    default:
        break;
    }

    // Reached only for data types without a BKCL translation above.
    std::cerr << "Unsupported data type: " << datatype << std::endl;
    std::abort();
    // Never executed; kept so every control path visibly returns a value.
    return BKCL_FLOAT16;
}
40+
41+
// Maps an InfiniCCL reduction operator onto the matching BKCL operator.
//
// SUM, PROD, MAX and MIN are translated. Any other value is reported on
// stderr (mirroring getBkclDtype, so the abort is no longer silent) and the
// process is terminated with std::abort().
inline BKCLOp getBkclRedOp(infinicclReduceOp_t op) {
    switch (op) {
    case INFINICCL_SUM:
        return BKCL_ADD;
    case INFINICCL_PROD:
        return BKCL_PRODUCT;
    case INFINICCL_MAX:
        return BKCL_MAX;
    case INFINICCL_MIN:
        return BKCL_MIN;
    default:
        // Emit a diagnostic before aborting so the failure can be traced.
        std::cerr << "Unsupported reduce op: " << static_cast<int>(op) << std::endl;
        std::abort();
        // Never executed; kept so every control path visibly returns a value.
        return BKCL_ADD;
    }
}
56+
57+
namespace infiniccl::kunlun {
58+
59+
infiniStatus_t commInitAll(
60+
infinicclComm_t *comms,
61+
int ndevice,
62+
const int *device_ids) {
63+
std::vector<bkclComm_t> bkcl_comms(ndevice);
64+
CHECK_BKCL(bkcl_comm_init_all(bkcl_comms.data(), ndevice, device_ids));
65+
66+
for (int i = 0; i < ndevice; i++) {
67+
comms[i] = new InfinicclComm{INFINI_DEVICE_KUNLUN, device_ids[i], (void *)(bkcl_comms[i])};
68+
}
69+
70+
return INFINI_STATUS_SUCCESS;
71+
}
72+
73+
infiniStatus_t commDestroy(infinicclComm_t comm) {
74+
CHECK_BKCL(bkcl_destroy_context(getBkclComm(comm)));
75+
delete comm;
76+
return INFINI_STATUS_SUCCESS;
77+
}
78+
79+
infiniStatus_t allReduce(
80+
void *sendbuf,
81+
void *recvbuf,
82+
size_t count,
83+
infiniDtype_t datatype,
84+
infinicclReduceOp_t op,
85+
infinicclComm_t comm,
86+
infinirtStream_t stream) {
87+
CHECK_DTYPE(datatype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16);
88+
CHECK_BKCL(bkcl_all_reduce(
89+
getBkclComm(comm),
90+
sendbuf,
91+
recvbuf,
92+
count,
93+
getBkclDtype(datatype),
94+
getBkclRedOp(op),
95+
getKunlunStream(stream)));
96+
97+
return INFINI_STATUS_SUCCESS;
98+
}
99+
100+
} // namespace infiniccl::kunlun
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#ifndef INFINICCL_KUNLUN_H_
2+
#define INFINICCL_KUNLUN_H_
3+
4+
#include "../infiniccl_impl.h"
5+
6+
#if defined(ENABLE_KUNLUN_API) && defined(ENABLE_CCL)
7+
INFINICCL_DEVICE_API_IMPL(kunlun)
8+
#else
9+
INFINICCL_DEVICE_API_NOOP(kunlun)
10+
#endif
11+
12+
#endif /* INFINICCL_KUNLUN_H_ */

src/infinirt/kunlun/infinirt_kunlun.cc

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#include "infinirt_kunlun.h"
22
#include "../../utils.h"
3+
#include <cstring>
34
#include <xpu/runtime.h>
45
#include <xpu/runtime_ex.h>
56

@@ -20,6 +21,8 @@ infiniStatus_t setDevice(int device_id) {
2021
}
2122

2223
infiniStatus_t deviceSynchronize() {
    // TODO: the Kunlun XPU runtime offers no device-wide synchronization API.
    // xpu_wait() with no argument only waits for the default stream.
    CHECK_KUNLUNRT(xpu_wait());

    return INFINI_STATUS_SUCCESS;
}
@@ -103,17 +106,36 @@ infiniStatus_t memcpy(void *dst, const void *src, size_t size, infinirtMemcpyKin
103106
case INFINIRT_MEMCPY_D2D:
104107
CHECK_KUNLUNRT(xpu_memcpy(dst, src, static_cast<uint64_t>(size), XPUMemcpyKind::XPU_DEVICE_TO_DEVICE));
105108
return INFINI_STATUS_SUCCESS;
109+
case INFINIRT_MEMCPY_H2H:
110+
std::memcpy(dst, src, size);
111+
return INFINI_STATUS_SUCCESS;
106112
default:
107113
return INFINI_STATUS_INTERNAL_ERROR;
108114
}
109115
}
110116

111117
// Copies `size` bytes from `src` to `dst` according to `kind`.
//
// H2D, D2H and D2D copies are issued with xpu_memcpy_async() on `stream`.
// H2H copies are performed immediately with std::memcpy and do not use
// `stream`.
// NOTE(review): the H2H path is therefore not ordered after work already
// queued on `stream` — confirm that callers do not rely on such ordering.
//
// Returns INFINI_STATUS_SUCCESS on success and INFINI_STATUS_INTERNAL_ERROR
// for an unrecognised `kind`.
infiniStatus_t memcpyAsync(void *dst, const void *src, size_t size, infinirtMemcpyKind_t kind, infinirtStream_t stream) {
    // Host-to-host copies never reach the XPU runtime.
    if (kind == INFINIRT_MEMCPY_H2H) {
        std::memcpy(dst, src, size);
        return INFINI_STATUS_SUCCESS;
    }

    // Translate the runtime-agnostic copy direction to the XPU enum.
    XPUMemcpyKind xpu_kind{};
    switch (kind) {
    case INFINIRT_MEMCPY_H2D:
        xpu_kind = XPUMemcpyKind::XPU_HOST_TO_DEVICE;
        break;
    case INFINIRT_MEMCPY_D2H:
        xpu_kind = XPUMemcpyKind::XPU_DEVICE_TO_HOST;
        break;
    case INFINIRT_MEMCPY_D2D:
        xpu_kind = XPUMemcpyKind::XPU_DEVICE_TO_DEVICE;
        break;
    default:
        return INFINI_STATUS_INTERNAL_ERROR;
    }

    CHECK_KUNLUNRT(xpu_memcpy_async(dst, src, static_cast<uint64_t>(size), xpu_kind, (kunlunStream_t)stream));
    return INFINI_STATUS_SUCCESS;
}
115135

116136
// Allocates `size` bytes of device memory and stores the address in `*p_ptr`.
//
// kunlun3 does not support asynchronous allocation, so the allocation is made
// with the blocking xpu_malloc() and `stream` is not used.
// TODO: support async malloc
infiniStatus_t mallocAsync(void **p_ptr, size_t size, infinirtStream_t stream) {
    (void)stream; // unused until an asynchronous allocation path exists
    const uint64_t byte_count = static_cast<uint64_t>(size);
    CHECK_KUNLUNRT(xpu_malloc(p_ptr, byte_count));
    return INFINI_STATUS_SUCCESS;
}

xmake.lua

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,9 @@ target("infiniccl")
303303
if has_config("moore-gpu") then
304304
add_deps("infiniccl-moore")
305305
end
306+
if has_config("kunlun-xpu") then
307+
add_deps("infiniccl-kunlun")
308+
end
306309

307310
set_languages("cxx17")
308311

xmake/kunlun.lua

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ local XRE_DIR = path.join(KUNLUN_HOME, "xre")
44
local XTDK_DIR = path.join(KUNLUN_HOME, "xtdk")
55
local XDNN_DIR = path.join(KUNLUN_HOME, "xhpc", "xdnn")
66
local XBLAS_DIR = path.join(KUNLUN_HOME, "xhpc", "xblas")
7+
local XCCL_DIR = path.join(KUNLUN_HOME, "xccl")
78

89
-- Add include dirs
910
add_includedirs(path.join(XRE_DIR, "include"), {public = true})
@@ -15,6 +16,8 @@ add_includedirs(path.join(XBLAS_DIR, "include"), {public = true})
1516
add_linkdirs(path.join(XRE_DIR, "so"))
1617
add_linkdirs(path.join(XDNN_DIR, "so"))
1718
add_linkdirs(path.join(XBLAS_DIR, "so"))
19+
20+
-- Add links
1821
add_links("xpurt", "xpuapi", "xpu_blas")
1922

2023
rule("xpu")
@@ -94,5 +97,20 @@ target("infinirt-kunlun")
9497
-- Add include dirs
9598
add_files("$(projectdir)/src/infinirt/kunlun/*.cc")
9699
add_cxflags("-lstdc++ -Wall -Werror -fPIC")
100+
target_end()
97101

102+
-- Static library with the Kunlun (BKCL) backend of infiniccl.
-- Sources and BKCL link settings are added only when the "ccl" option is
-- enabled; otherwise the target has no source files.
target("infiniccl-kunlun")
    set_kind("static")
    set_languages("cxx17")
    set_warnings("all", "error")
    add_deps("infinirt", "infini-utils")

    -- Empty install hook.
    -- NOTE(review): presumably suppresses the default install step for this
    -- internal static library — confirm against the other backend targets.
    on_install(function (target) end)

    if has_config("ccl") then
        -- BKCL headers and shared objects live under the XCCL directory.
        add_includedirs(path.join(XCCL_DIR, "include"))
        add_linkdirs(path.join(XCCL_DIR, "so"))
        add_links("bkcl")

        add_files("$(projectdir)/src/infiniccl/kunlun/*.cc")
        add_cxflags("-lstdc++ -fPIC")
    end
target_end()

0 commit comments

Comments
 (0)