Skip to content

Commit 52eee8d

Browse files
Nikita Gusev, blazej-smorawski, jbrosenz, ranukund, dependabot[bot]
authored
Intel(R) oneAPI Collective Communications Library (oneCCL) 2021.15.5 (#183)
* Intel(R) oneAPI Collective Communications Library (oneCCL) 2021.15.5 * infra: bump cmake version * Update README.md to provide notice of deprecation of legacy C++ API. * Update README.md Added * to NCCL to refer the reader to the new Notices and Disclaimers section (taken from our release notes) which addresses the trademark ownership for NCCL. Used improvements suggested by Nikita. * Update README.md I learned that I need to use the * only on the first instance of the trademarked name. Removed the second *. * Update README.md Co-authored-by: Ranu Kundu <ranu.kundu@intel.com> * Update README.md Co-authored-by: Ranu Kundu <ranu.kundu@intel.com> * Update README.md Co-authored-by: Ranu Kundu <ranu.kundu@intel.com> * Update README.md Co-authored-by: Ranu Kundu <ranu.kundu@intel.com> * Bump setuptools from 75.1.0 to 78.1.1 in /doc Bumps [setuptools](https://github.com/pypa/setuptools) from 75.1.0 to 78.1.1. - [Release notes](https://github.com/pypa/setuptools/releases) - [Changelog](https://github.com/pypa/setuptools/blob/main/NEWS.rst) - [Commits](pypa/setuptools@v75.1.0...v78.1.1) --- updated-dependencies: - dependency-name: setuptools dependency-version: 78.1.1 dependency-type: direct:production ... Signed-off-by: dependabot[bot] <support@github.com> --------- Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: Błażej Smorawski <blazej.smorawski@intel.com> Co-authored-by: Joel Rosenzweig <joel.b.rosenzweig@intel.com> Co-authored-by: Ranu Kundu <ranu.kundu@intel.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
1 parent 286870a commit 52eee8d

32 files changed

+931
-373
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -335,7 +335,7 @@ endif()
335335

336336
set(CCL_MAJOR_VERSION "2021")
337337
set(CCL_MINOR_VERSION "15")
338-
set(CCL_UPDATE_VERSION "4")
338+
set(CCL_UPDATE_VERSION "5")
339339
set(CCL_PRODUCT_STATUS "Gold")
340340
string(TIMESTAMP CCL_PRODUCT_BUILD_DATE "%Y-%m-%dT %H:%M:%SZ")
341341
get_vcs_properties("git")

include/oneapi/ccl/config.h

Lines changed: 0 additions & 43 deletions
This file was deleted.

man/doxconfig

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
PROJECT_NAME = "Intel® oneAPI Collective Communications Library"
2-
PROJECT_NUMBER = "2021.15.4"
2+
PROJECT_NUMBER = "2021.15.5"
33

44
INPUT = ../src/common/env/vars.hpp ../src/common/env/vars_experimental.hpp
55

src/coll/algorithms/allgatherv/sycl/allgatherv_pcie.cpp

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ ccl::event allgatherv_ll_ring(const void *send_buf,
4545
bool p2p = node_comm->get_topo_manager().has_p2p_access();
4646
uint32_t pattern = comm->get_rt_pattern(pattern_type::collective, -1);
4747

48-
auto lambda = [&]<typename T, int NRanks, template <typename, int> class Proto>() {
48+
auto lambda = [&]<typename T, template <typename, int> class Proto>(int NRanks) {
4949
T *peerbuf0[NRanks];
5050
T *peerbuf1[NRanks];
5151
for (int i = 0; i < NRanks; i++) {
@@ -54,30 +54,31 @@ ccl::event allgatherv_ll_ring(const void *send_buf,
5454
}
5555
T *ipcbuf0 = (T *)get_tmp_buf(0, comm);
5656
T *ipcbuf1 = (T *)get_tmp_buf(1, comm);
57-
sycl::event e = AllGather<T, NRanks, Proto, RingTransmit>::launch((T *)send_buf,
58-
(T *)recv_buf,
59-
ipcbuf0,
60-
ipcbuf1,
61-
peerbuf0,
62-
peerbuf1,
63-
send_count,
64-
comm_rank,
65-
pattern,
66-
q,
67-
p2p,
68-
done);
57+
sycl::event e = AllGather<T, Proto, RingTransmit>::launch(NRanks,
58+
(T *)send_buf,
59+
(T *)recv_buf,
60+
ipcbuf0,
61+
ipcbuf1,
62+
peerbuf0,
63+
peerbuf1,
64+
send_count,
65+
comm_rank,
66+
pattern,
67+
q,
68+
p2p,
69+
done);
6970
// update pattern
7071
comm->update_rt_pattern(pattern_type::collective, -1, pattern);
7172
return e;
7273
};
7374

7475
if (send_size <= ccl::global_data::env().sycl_allgatherv_ll_threshold) {
7576
// small ring with LL
76-
sycl_e = invoke_pcie<Rt64_PCIE>(lambda, comm, dtype);
77+
sycl_e = invoke_pcie_type<Rt64_PCIE>(lambda, comm_size, dtype);
7778
}
7879
else {
7980
// simple ring with LL256
80-
sycl_e = invoke_pcie<Rt64_128_PCIE>(lambda, comm, dtype);
81+
sycl_e = invoke_pcie_type<Rt64_128_PCIE>(lambda, comm_size, dtype);
8182
}
8283

8384
return ccl::event::create_from_native(sycl_e);

src/coll/algorithms/allgatherv/sycl/allgatherv_pcie.hpp

Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -21,19 +21,19 @@
2121
#include "coll/algorithms/utils/transmit/transmit.hpp"
2222

2323
template <typename T,
24-
int NRanks,
2524
template <typename, int>
2625
class Proto,
27-
template <typename, int, template <typename, int> class, int>
26+
template <typename, template <typename, int> class, int>
2827
class Transmit,
2928
int SubGroupSize = 16>
30-
struct AllGather : public Transmit<T, NRanks, Proto, SubGroupSize> {
31-
using Super = Transmit<T, NRanks, Proto, SubGroupSize>;
29+
struct AllGather : public Transmit<T, Proto, SubGroupSize> {
30+
using Super = Transmit<T, Proto, SubGroupSize>;
3231
using message_t = typename Super::message_t;
3332
constexpr static int wireCapacity = Super::wireCapacity;
3433
using Super::runAllgather;
3534

36-
AllGather(T* input,
35+
AllGather(int nranks,
36+
T* input,
3737
T* output,
3838
size_t nelems,
3939
int rank,
@@ -43,16 +43,17 @@ struct AllGather : public Transmit<T, NRanks, Proto, SubGroupSize> {
4343
T* const peerBuf0[],
4444
T* const peerBuf1[],
4545
bool p2p)
46-
: Transmit<T, NRanks, Proto, SubGroupSize>(input,
47-
output,
48-
scatterBuf,
49-
gatherBuf,
50-
peerBuf0,
51-
peerBuf1,
52-
calcWorkSize(input, output, nelems * sizeof(T)),
53-
rank,
54-
seqNo,
55-
p2p),
46+
: Transmit<T, Proto, SubGroupSize>(nranks,
47+
input,
48+
output,
49+
scatterBuf,
50+
gatherBuf,
51+
peerBuf0,
52+
peerBuf1,
53+
calcWorkSize(input, output, nelems * sizeof(T)),
54+
rank,
55+
seqNo,
56+
p2p),
5657
workSize(calcWorkSize(input, output, nelems * sizeof(T))) {}
5758

5859
sycl::nd_range<1> getLaunchParam(uint32_t& updateSeqNo) const {
@@ -79,7 +80,8 @@ struct AllGather : public Transmit<T, NRanks, Proto, SubGroupSize> {
7980
return sycl::nd_range<1>(actualSS * wirePerSS * w * SubGroupSize, nThreads * SubGroupSize);
8081
}
8182

82-
static sycl::event launch(T* input,
83+
static sycl::event launch(int nranks,
84+
T* input,
8385
T* output,
8486
T* ipcbuf0,
8587
T* ipcbuf1,
@@ -92,7 +94,7 @@ struct AllGather : public Transmit<T, NRanks, Proto, SubGroupSize> {
9294
bool p2p,
9395
bool& done) {
9496
sycl::event e;
95-
AllGather offload(input, output, nelems, rank, step, ipcbuf0, ipcbuf1, peerbuf0, peerbuf1, p2p);
97+
AllGather offload(nranks, input, output, nelems, rank, step, ipcbuf0, ipcbuf1, peerbuf0, peerbuf1, p2p);
9698
if (offload.workSize == 0) {
9799
done = false;
98100
return e;

src/coll/algorithms/allgatherv/sycl/allgatherv_scaleout_sycl.cpp

Lines changed: 2 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -72,47 +72,9 @@ ccl::event allgatherv_scaleout_sycl_direct(sycl::queue& q,
7272
sycl_deps.push_back(ev);
7373
}
7474
else if (!is_cpu_buffers) {
75-
auto lib_attr = atl_mpi_ctx::get_lib_attr();
76-
if (lib_attr.type == atl_mpi_ctx::ATL_MPI_LIB_IMPI && lib_attr.hmem == 1) {
77-
const char* env_val = getenv("I_MPI_OFFLOAD");
78-
int offload = 0;
79-
if (env_val != nullptr)
80-
offload = atoi(env_val);
81-
82-
if (offload == 0) {
83-
LOG_INFO("copy_to_host=false with a GPU buffer. "
84-
"make sure I_MPI_OFFLOAD is set or GPU RDMA is enabled");
85-
done = false;
86-
ccl::event e;
87-
return e;
88-
}
89-
}
90-
else if (lib_attr.type == atl_mpi_ctx::ATL_MPI_LIB_MPICH && lib_attr.hmem == 1) {
91-
const char* env_val = getenv("MPIR_CVAR_CH4_OFI_ENABLE_HMEM");
92-
int gpu_rdma = 0;
93-
if (env_val != nullptr)
94-
gpu_rdma = atoi(env_val);
95-
96-
env_val = getenv("MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE");
97-
int gpu_pipeline = 0;
98-
if (env_val != nullptr)
99-
gpu_pipeline = atoi(env_val);
100-
101-
if (!gpu_rdma && !gpu_pipeline) {
102-
LOG_INFO(
103-
"copy_to_host=false with a GPU buffer. "
104-
"make sure MPIR_CVAR_CH4_OFI_ENABLE_HMEM or MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE are set or GPU RDMA is enabled");
105-
done = false;
106-
ccl::event e;
107-
return e;
108-
}
109-
}
110-
else {
75+
if (!check_mpi_supports_rdma()) {
11176
LOG_INFO("copy_to_host=false with a GPU buffer. "
112-
"no transport with GPU RDMA enabled was detected");
113-
done = false;
114-
ccl::event e;
115-
return e;
77+
"make sure MPI GPU RDMA is enabled");
11678
}
11779
}
11880

src/coll/algorithms/allreduce/sycl/allreduce_pcie.cpp

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ ccl::event allreduce_ll_ring(const void *src,
4444
bool p2p = node_comm->get_topo_manager().has_p2p_access();
4545
uint32_t pattern = comm->get_rt_pattern(pattern_type::collective, -1);
4646

47-
auto lambda = [&]<typename T, int NRanks, template <typename, int> class Proto>() {
47+
auto lambda = [&]<typename T, template <typename, int> class Proto>(int NRanks) {
4848
T *peerbuf0[NRanks];
4949
T *peerbuf1[NRanks];
5050
for (int i = 0; i < NRanks; i++) {
@@ -53,28 +53,29 @@ ccl::event allreduce_ll_ring(const void *src,
5353
}
5454
T *ipcbuf0 = (T *)get_tmp_buf(0, comm);
5555
T *ipcbuf1 = (T *)get_tmp_buf(1, comm);
56-
sycl::event e = AllReduce<T, NRanks, Proto, RingTransmit>::launch((T *)dst,
57-
ipcbuf0,
58-
ipcbuf1,
59-
peerbuf0,
60-
peerbuf1,
61-
count,
62-
comm_rank,
63-
pattern,
64-
q,
65-
p2p,
66-
done);
56+
sycl::event e = AllReduce<T, Proto, RingTransmit>::launch(NRanks,
57+
(T *)dst,
58+
ipcbuf0,
59+
ipcbuf1,
60+
peerbuf0,
61+
peerbuf1,
62+
count,
63+
comm_rank,
64+
pattern,
65+
q,
66+
p2p,
67+
done);
6768
comm->update_rt_pattern(pattern_type::collective, -1, pattern);
6869
return e;
6970
};
7071

7172
if (count * dt_sz <= ccl::global_data::env().sycl_allreduce_ll_threshold) {
7273
// small ring with LL
73-
sycl_e = invoke_pcie<Rt64_PCIE>(lambda, comm, dtype);
74+
sycl_e = invoke_pcie_type<Rt64_PCIE>(lambda, comm_size, dtype);
7475
}
7576
else {
7677
// simple ring with LL256
77-
sycl_e = invoke_pcie<Rt64_128_PCIE>(lambda, comm, dtype);
78+
sycl_e = invoke_pcie_type<Rt64_128_PCIE>(lambda, comm_size, dtype);
7879
}
7980

8081
if (reduction == ccl::reduction::avg) {

src/coll/algorithms/allreduce/sycl/allreduce_pcie.hpp

Lines changed: 21 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -21,18 +21,18 @@
2121
#include "coll/algorithms/utils/transmit/transmit.hpp"
2222

2323
template <typename T,
24-
int NRanks,
2524
template <typename, int>
2625
class Proto,
27-
template <typename, int, template <typename, int> class, int>
26+
template <typename, template <typename, int> class, int>
2827
class Transmit,
2928
int SubGroupSize = 16>
30-
struct AllReduce : public Transmit<T, NRanks, Proto, SubGroupSize> {
31-
using Super = Transmit<T, NRanks, Proto, SubGroupSize>;
29+
struct AllReduce : public Transmit<T, Proto, SubGroupSize> {
30+
using Super = Transmit<T, Proto, SubGroupSize>;
3231
using message_t = typename Super::message_t;
3332
constexpr static int wireCapacity = Super::wireCapacity;
3433

35-
AllReduce(T* input,
34+
AllReduce(int nranks,
35+
T* input,
3636
size_t nelems,
3737
int rank,
3838
uint32_t seqNo,
@@ -41,16 +41,17 @@ struct AllReduce : public Transmit<T, NRanks, Proto, SubGroupSize> {
4141
T* const peerBuf0[],
4242
T* const peerBuf1[],
4343
bool p2p)
44-
: Transmit<T, NRanks, Proto, SubGroupSize>(input,
45-
scatterBuf,
46-
gatherBuf,
47-
peerBuf0,
48-
peerBuf1,
49-
calcWorkSize(input, nelems * sizeof(T)),
50-
rank,
51-
seqNo,
52-
p2p),
53-
workSize(calcWorkSize(input, nelems * sizeof(T))) {}
44+
: Transmit<T, Proto, SubGroupSize>(nranks,
45+
input,
46+
scatterBuf,
47+
gatherBuf,
48+
peerBuf0,
49+
peerBuf1,
50+
calcWorkSize(input, nelems * sizeof(T), nranks),
51+
rank,
52+
seqNo,
53+
p2p),
54+
workSize(calcWorkSize(input, nelems * sizeof(T), nranks)) {}
5455

5556
static int scatterVerify(uint32_t* host, int rank, uint32_t flag, size_t nWorkElemsInInt);
5657
static int stage2Verify(T* host, int rank, uint32_t flag, size_t nWorkElemsInInt);
@@ -80,7 +81,8 @@ struct AllReduce : public Transmit<T, NRanks, Proto, SubGroupSize> {
8081
return sycl::nd_range<1>(actualSS * wirePerSS * w * SubGroupSize, nThreads * SubGroupSize);
8182
}
8283

83-
static sycl::event launch(T* input,
84+
static sycl::event launch(int nranks,
85+
T* input,
8486
T* ipcbuf0,
8587
T* ipcbuf1,
8688
T* const peerbuf0[],
@@ -92,7 +94,8 @@ struct AllReduce : public Transmit<T, NRanks, Proto, SubGroupSize> {
9294
bool p2p,
9395
bool& done) {
9496
sycl::event e;
95-
AllReduce offload(input, nelems, rank, step, ipcbuf0, ipcbuf1, peerbuf0, peerbuf1, p2p);
97+
AllReduce offload(
98+
nranks, input, nelems, rank, step, ipcbuf0, ipcbuf1, peerbuf0, peerbuf1, p2p);
9699
if (offload.workSize == 0) {
97100
done = false;
98101
return e;
@@ -134,7 +137,7 @@ struct AllReduce : public Transmit<T, NRanks, Proto, SubGroupSize> {
134137

135138
private:
136139
// TODO: buffer plan and start point calc
137-
static size_t calcWorkSize(T* input, size_t size) {
140+
static size_t calcWorkSize(T* input, size_t size, int NRanks) {
138141
// Input must be message size align
139142
if ((uintptr_t)input % sizeof(message_t) != 0)
140143
throw std::logic_error("We only support aligned pointer for now");

0 commit comments

Comments
 (0)