Skip to content

Commit 52eee8d

Browse files
Nikita Gusev, blazej-smorawski, jbrosenz, ranukund, dependabot[bot]
authored
Intel(R) oneAPI Collective Communications Library (oneCCL) 2021.15.5 (#183)
* Intel(R) oneAPI Collective Communications Library (oneCCL) 2021.15.5 * infra: bump cmake version * Update README.md to provide notice of deprecation of legacy C++ API. * Update README.md Added * to NCCL to refer the reader to the new Notices and Disclaimers section (taken from our release notes) which addresses the trademark ownership for NCCL. Used improvements suggested by Nikita. * Update README.md I learned that I need to use the * only on the first instance of the trademarked name. Removed the second *. * Update README.md Co-authored-by: Ranu Kundu <ranu.kundu@intel.com> * Update README.md Co-authored-by: Ranu Kundu <ranu.kundu@intel.com> * Update README.md Co-authored-by: Ranu Kundu <ranu.kundu@intel.com> * Update README.md Co-authored-by: Ranu Kundu <ranu.kundu@intel.com> * Bump setuptools from 75.1.0 to 78.1.1 in /doc Bumps [setuptools](https://github.com/pypa/setuptools) from 75.1.0 to 78.1.1. - [Release notes](https://github.com/pypa/setuptools/releases) - [Changelog](https://github.com/pypa/setuptools/blob/main/NEWS.rst) - [Commits](pypa/setuptools@v75.1.0...v78.1.1) --- updated-dependencies: - dependency-name: setuptools dependency-version: 78.1.1 dependency-type: direct:production ... Signed-off-by: dependabot[bot] <support@github.com> --------- Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: Błażej Smorawski <blazej.smorawski@intel.com> Co-authored-by: Joel Rosenzweig <joel.b.rosenzweig@intel.com> Co-authored-by: Ranu Kundu <ranu.kundu@intel.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
1 parent 286870a commit 52eee8d

32 files changed

+931
-373
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -335,7 +335,7 @@ endif()
335335

336336
set(CCL_MAJOR_VERSION "2021")
337337
set(CCL_MINOR_VERSION "15")
338-
set(CCL_UPDATE_VERSION "4")
338+
set(CCL_UPDATE_VERSION "5")
339339
set(CCL_PRODUCT_STATUS "Gold")
340340
string(TIMESTAMP CCL_PRODUCT_BUILD_DATE "%Y-%m-%dT %H:%M:%SZ")
341341
get_vcs_properties("git")

include/oneapi/ccl/config.h

Lines changed: 0 additions & 43 deletions
This file was deleted.

man/doxconfig

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
PROJECT_NAME = "Intel® oneAPI Collective Communications Library"
2-
PROJECT_NUMBER = "2021.15.4"
2+
PROJECT_NUMBER = "2021.15.5"
33

44
INPUT = ../src/common/env/vars.hpp ../src/common/env/vars_experimental.hpp
55

src/coll/algorithms/allgatherv/sycl/allgatherv_pcie.cpp

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ ccl::event allgatherv_ll_ring(const void *send_buf,
4545
bool p2p = node_comm->get_topo_manager().has_p2p_access();
4646
uint32_t pattern = comm->get_rt_pattern(pattern_type::collective, -1);
4747

48-
auto lambda = [&]<typename T, int NRanks, template <typename, int> class Proto>() {
48+
auto lambda = [&]<typename T, template <typename, int> class Proto>(int NRanks) {
4949
T *peerbuf0[NRanks];
5050
T *peerbuf1[NRanks];
5151
for (int i = 0; i < NRanks; i++) {
@@ -54,30 +54,31 @@ ccl::event allgatherv_ll_ring(const void *send_buf,
5454
}
5555
T *ipcbuf0 = (T *)get_tmp_buf(0, comm);
5656
T *ipcbuf1 = (T *)get_tmp_buf(1, comm);
57-
sycl::event e = AllGather<T, NRanks, Proto, RingTransmit>::launch((T *)send_buf,
58-
(T *)recv_buf,
59-
ipcbuf0,
60-
ipcbuf1,
61-
peerbuf0,
62-
peerbuf1,
63-
send_count,
64-
comm_rank,
65-
pattern,
66-
q,
67-
p2p,
68-
done);
57+
sycl::event e = AllGather<T, Proto, RingTransmit>::launch(NRanks,
58+
(T *)send_buf,
59+
(T *)recv_buf,
60+
ipcbuf0,
61+
ipcbuf1,
62+
peerbuf0,
63+
peerbuf1,
64+
send_count,
65+
comm_rank,
66+
pattern,
67+
q,
68+
p2p,
69+
done);
6970
// update pattern
7071
comm->update_rt_pattern(pattern_type::collective, -1, pattern);
7172
return e;
7273
};
7374

7475
if (send_size <= ccl::global_data::env().sycl_allgatherv_ll_threshold) {
7576
// small ring with LL
76-
sycl_e = invoke_pcie<Rt64_PCIE>(lambda, comm, dtype);
77+
sycl_e = invoke_pcie_type<Rt64_PCIE>(lambda, comm_size, dtype);
7778
}
7879
else {
7980
// simple ring with LL256
80-
sycl_e = invoke_pcie<Rt64_128_PCIE>(lambda, comm, dtype);
81+
sycl_e = invoke_pcie_type<Rt64_128_PCIE>(lambda, comm_size, dtype);
8182
}
8283

8384
return ccl::event::create_from_native(sycl_e);

src/coll/algorithms/allgatherv/sycl/allgatherv_pcie.hpp

Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -21,19 +21,19 @@
2121
#include "coll/algorithms/utils/transmit/transmit.hpp"
2222

2323
template <typename T,
24-
int NRanks,
2524
template <typename, int>
2625
class Proto,
27-
template <typename, int, template <typename, int> class, int>
26+
template <typename, template <typename, int> class, int>
2827
class Transmit,
2928
int SubGroupSize = 16>
30-
struct AllGather : public Transmit<T, NRanks, Proto, SubGroupSize> {
31-
using Super = Transmit<T, NRanks, Proto, SubGroupSize>;
29+
struct AllGather : public Transmit<T, Proto, SubGroupSize> {
30+
using Super = Transmit<T, Proto, SubGroupSize>;
3231
using message_t = typename Super::message_t;
3332
constexpr static int wireCapacity = Super::wireCapacity;
3433
using Super::runAllgather;
3534

36-
AllGather(T* input,
35+
AllGather(int nranks,
36+
T* input,
3737
T* output,
3838
size_t nelems,
3939
int rank,
@@ -43,16 +43,17 @@ struct AllGather : public Transmit<T, NRanks, Proto, SubGroupSize> {
4343
T* const peerBuf0[],
4444
T* const peerBuf1[],
4545
bool p2p)
46-
: Transmit<T, NRanks, Proto, SubGroupSize>(input,
47-
output,
48-
scatterBuf,
49-
gatherBuf,
50-
peerBuf0,
51-
peerBuf1,
52-
calcWorkSize(input, output, nelems * sizeof(T)),
53-
rank,
54-
seqNo,
55-
p2p),
46+
: Transmit<T, Proto, SubGroupSize>(nranks,
47+
input,
48+
output,
49+
scatterBuf,
50+
gatherBuf,
51+
peerBuf0,
52+
peerBuf1,
53+
calcWorkSize(input, output, nelems * sizeof(T)),
54+
rank,
55+
seqNo,
56+
p2p),
5657
workSize(calcWorkSize(input, output, nelems * sizeof(T))) {}
5758

5859
sycl::nd_range<1> getLaunchParam(uint32_t& updateSeqNo) const {
@@ -79,7 +80,8 @@ struct AllGather : public Transmit<T, NRanks, Proto, SubGroupSize> {
7980
return sycl::nd_range<1>(actualSS * wirePerSS * w * SubGroupSize, nThreads * SubGroupSize);
8081
}
8182

82-
static sycl::event launch(T* input,
83+
static sycl::event launch(int nranks,
84+
T* input,
8385
T* output,
8486
T* ipcbuf0,
8587
T* ipcbuf1,
@@ -92,7 +94,7 @@ struct AllGather : public Transmit<T, NRanks, Proto, SubGroupSize> {
9294
bool p2p,
9395
bool& done) {
9496
sycl::event e;
95-
AllGather offload(input, output, nelems, rank, step, ipcbuf0, ipcbuf1, peerbuf0, peerbuf1, p2p);
97+
AllGather offload(nranks, input, output, nelems, rank, step, ipcbuf0, ipcbuf1, peerbuf0, peerbuf1, p2p);
9698
if (offload.workSize == 0) {
9799
done = false;
98100
return e;

src/coll/algorithms/allgatherv/sycl/allgatherv_scaleout_sycl.cpp

Lines changed: 2 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -72,47 +72,9 @@ ccl::event allgatherv_scaleout_sycl_direct(sycl::queue& q,
7272
sycl_deps.push_back(ev);
7373
}
7474
else if (!is_cpu_buffers) {
75-
auto lib_attr = atl_mpi_ctx::get_lib_attr();
76-
if (lib_attr.type == atl_mpi_ctx::ATL_MPI_LIB_IMPI && lib_attr.hmem == 1) {
77-
const char* env_val = getenv("I_MPI_OFFLOAD");
78-
int offload = 0;
79-
if (env_val != nullptr)
80-
offload = atoi(env_val);
81-
82-
if (offload == 0) {
83-
LOG_INFO("copy_to_host=false with a GPU buffer. "
84-
"make sure I_MPI_OFFLOAD is set or GPU RDMA is enabled");
85-
done = false;
86-
ccl::event e;
87-
return e;
88-
}
89-
}
90-
else if (lib_attr.type == atl_mpi_ctx::ATL_MPI_LIB_MPICH && lib_attr.hmem == 1) {
91-
const char* env_val = getenv("MPIR_CVAR_CH4_OFI_ENABLE_HMEM");
92-
int gpu_rdma = 0;
93-
if (env_val != nullptr)
94-
gpu_rdma = atoi(env_val);
95-
96-
env_val = getenv("MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE");
97-
int gpu_pipeline = 0;
98-
if (env_val != nullptr)
99-
gpu_pipeline = atoi(env_val);
100-
101-
if (!gpu_rdma && !gpu_pipeline) {
102-
LOG_INFO(
103-
"copy_to_host=false with a GPU buffer. "
104-
"make sure MPIR_CVAR_CH4_OFI_ENABLE_HMEM or MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE are set or GPU RDMA is enabled");
105-
done = false;
106-
ccl::event e;
107-
return e;
108-
}
109-
}
110-
else {
75+
if (!check_mpi_supports_rdma()) {
11176
LOG_INFO("copy_to_host=false with a GPU buffer. "
112-
"no transport with GPU RDMA enabled was detected");
113-
done = false;
114-
ccl::event e;
115-
return e;
77+
"make sure MPI GPU RDMA is enabled");
11678
}
11779
}
11880

src/coll/algorithms/allreduce/sycl/allreduce_pcie.cpp

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ ccl::event allreduce_ll_ring(const void *src,
4444
bool p2p = node_comm->get_topo_manager().has_p2p_access();
4545
uint32_t pattern = comm->get_rt_pattern(pattern_type::collective, -1);
4646

47-
auto lambda = [&]<typename T, int NRanks, template <typename, int> class Proto>() {
47+
auto lambda = [&]<typename T, template <typename, int> class Proto>(int NRanks) {
4848
T *peerbuf0[NRanks];
4949
T *peerbuf1[NRanks];
5050
for (int i = 0; i < NRanks; i++) {
@@ -53,28 +53,29 @@ ccl::event allreduce_ll_ring(const void *src,
5353
}
5454
T *ipcbuf0 = (T *)get_tmp_buf(0, comm);
5555
T *ipcbuf1 = (T *)get_tmp_buf(1, comm);
56-
sycl::event e = AllReduce<T, NRanks, Proto, RingTransmit>::launch((T *)dst,
57-
ipcbuf0,
58-
ipcbuf1,
59-
peerbuf0,
60-
peerbuf1,
61-
count,
62-
comm_rank,
63-
pattern,
64-
q,
65-
p2p,
66-
done);
56+
sycl::event e = AllReduce<T, Proto, RingTransmit>::launch(NRanks,
57+
(T *)dst,
58+
ipcbuf0,
59+
ipcbuf1,
60+
peerbuf0,
61+
peerbuf1,
62+
count,
63+
comm_rank,
64+
pattern,
65+
q,
66+
p2p,
67+
done);
6768
comm->update_rt_pattern(pattern_type::collective, -1, pattern);
6869
return e;
6970
};
7071

7172
if (count * dt_sz <= ccl::global_data::env().sycl_allreduce_ll_threshold) {
7273
// small ring with LL
73-
sycl_e = invoke_pcie<Rt64_PCIE>(lambda, comm, dtype);
74+
sycl_e = invoke_pcie_type<Rt64_PCIE>(lambda, comm_size, dtype);
7475
}
7576
else {
7677
// simple ring with LL256
77-
sycl_e = invoke_pcie<Rt64_128_PCIE>(lambda, comm, dtype);
78+
sycl_e = invoke_pcie_type<Rt64_128_PCIE>(lambda, comm_size, dtype);
7879
}
7980

8081
if (reduction == ccl::reduction::avg) {

src/coll/algorithms/allreduce/sycl/allreduce_pcie.hpp

Lines changed: 21 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -21,18 +21,18 @@
2121
#include "coll/algorithms/utils/transmit/transmit.hpp"
2222

2323
template <typename T,
24-
int NRanks,
2524
template <typename, int>
2625
class Proto,
27-
template <typename, int, template <typename, int> class, int>
26+
template <typename, template <typename, int> class, int>
2827
class Transmit,
2928
int SubGroupSize = 16>
30-
struct AllReduce : public Transmit<T, NRanks, Proto, SubGroupSize> {
31-
using Super = Transmit<T, NRanks, Proto, SubGroupSize>;
29+
struct AllReduce : public Transmit<T, Proto, SubGroupSize> {
30+
using Super = Transmit<T, Proto, SubGroupSize>;
3231
using message_t = typename Super::message_t;
3332
constexpr static int wireCapacity = Super::wireCapacity;
3433

35-
AllReduce(T* input,
34+
AllReduce(int nranks,
35+
T* input,
3636
size_t nelems,
3737
int rank,
3838
uint32_t seqNo,
@@ -41,16 +41,17 @@ struct AllReduce : public Transmit<T, NRanks, Proto, SubGroupSize> {
4141
T* const peerBuf0[],
4242
T* const peerBuf1[],
4343
bool p2p)
44-
: Transmit<T, NRanks, Proto, SubGroupSize>(input,
45-
scatterBuf,
46-
gatherBuf,
47-
peerBuf0,
48-
peerBuf1,
49-
calcWorkSize(input, nelems * sizeof(T)),
50-
rank,
51-
seqNo,
52-
p2p),
53-
workSize(calcWorkSize(input, nelems * sizeof(T))) {}
44+
: Transmit<T, Proto, SubGroupSize>(nranks,
45+
input,
46+
scatterBuf,
47+
gatherBuf,
48+
peerBuf0,
49+
peerBuf1,
50+
calcWorkSize(input, nelems * sizeof(T), nranks),
51+
rank,
52+
seqNo,
53+
p2p),
54+
workSize(calcWorkSize(input, nelems * sizeof(T), nranks)) {}
5455

5556
static int scatterVerify(uint32_t* host, int rank, uint32_t flag, size_t nWorkElemsInInt);
5657
static int stage2Verify(T* host, int rank, uint32_t flag, size_t nWorkElemsInInt);
@@ -80,7 +81,8 @@ struct AllReduce : public Transmit<T, NRanks, Proto, SubGroupSize> {
8081
return sycl::nd_range<1>(actualSS * wirePerSS * w * SubGroupSize, nThreads * SubGroupSize);
8182
}
8283

83-
static sycl::event launch(T* input,
84+
static sycl::event launch(int nranks,
85+
T* input,
8486
T* ipcbuf0,
8587
T* ipcbuf1,
8688
T* const peerbuf0[],
@@ -92,7 +94,8 @@ struct AllReduce : public Transmit<T, NRanks, Proto, SubGroupSize> {
9294
bool p2p,
9395
bool& done) {
9496
sycl::event e;
95-
AllReduce offload(input, nelems, rank, step, ipcbuf0, ipcbuf1, peerbuf0, peerbuf1, p2p);
97+
AllReduce offload(
98+
nranks, input, nelems, rank, step, ipcbuf0, ipcbuf1, peerbuf0, peerbuf1, p2p);
9699
if (offload.workSize == 0) {
97100
done = false;
98101
return e;
@@ -134,7 +137,7 @@ struct AllReduce : public Transmit<T, NRanks, Proto, SubGroupSize> {
134137

135138
private:
136139
// TODO: buffer plan and start point calc
137-
static size_t calcWorkSize(T* input, size_t size) {
140+
static size_t calcWorkSize(T* input, size_t size, int NRanks) {
138141
// Input must be message size align
139142
if ((uintptr_t)input % sizeof(message_t) != 0)
140143
throw std::logic_error("We only support aligned pointer for now");

0 commit comments

Comments
 (0)