Skip to content

Commit 0a67730

Browse files
Nikita Gusev, blazej-smorawski, jbrosenz, and dependabot[bot]
authored
Intel(R) oneAPI Collective Communications Library (oneCCL) 2021.15.6 (#184)
* Intel(R) oneAPI Collective Communications Library (oneCCL) 2021.15.6

* infra: bump cmake version

* Update README.md to provide notice of deprecation of legacy C++ API.

* Bump setuptools from 75.1.0 to 78.1.1 in /doc

Bumps [setuptools](https://github.com/pypa/setuptools) from 75.1.0 to 78.1.1.
- [Release notes](https://github.com/pypa/setuptools/releases)
- [Changelog](https://github.com/pypa/setuptools/blob/main/NEWS.rst)
- [Commits](pypa/setuptools@v75.1.0...v78.1.1)

---
updated-dependencies:
- dependency-name: setuptools
  dependency-version: 78.1.1
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: Błażej Smorawski <blazej.smorawski@intel.com>
Co-authored-by: Joel Rosenzweig <joel.b.rosenzweig@intel.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
1 parent 52eee8d commit 0a67730

23 files changed

+349
-111
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -335,7 +335,7 @@ endif()
335335

336336
set(CCL_MAJOR_VERSION "2021")
337337
set(CCL_MINOR_VERSION "15")
338-
set(CCL_UPDATE_VERSION "5")
338+
set(CCL_UPDATE_VERSION "6")
339339
set(CCL_PRODUCT_STATUS "Gold")
340340
string(TIMESTAMP CCL_PRODUCT_BUILD_DATE "%Y-%m-%dT %H:%M:%SZ")
341341
get_vcs_properties("git")

man/doxconfig

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
PROJECT_NAME = "Intel® oneAPI Collective Communications Library"
2-
PROJECT_NUMBER = "2021.15.5"
2+
PROJECT_NUMBER = "2021.15.6"
33

44
INPUT = ../src/common/env/vars.hpp ../src/common/env/vars_experimental.hpp
55

src/atl/ofi/atl_ofi_comm.cpp

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,63 @@ atl_ofi_comm::atl_ofi_comm(int comm_size,
4545
CCL_THROW_IF_NOT(init_transport(true) == ATL_STATUS_SUCCESS, "init transport failed");
4646
}
4747

48+
// Dissemination barrier over point-to-point send/recv on endpoint ep_idx.
// In round k each rank sends a 1-byte token to (rank + 2^k) % size and
// receives one from (rank - 2^k + size) % size; after ceil(log2(size))
// rounds every rank has transitively heard from every other rank.
// Blocks until all rounds complete, then marks `req` completed so the
// caller's completion check succeeds immediately.
// NOTE(review): this polls to completion internally, so the returned
// request is already finished — presumably by design for the OFI path.
atl_status_t atl_ofi_comm::barrier(size_t ep_idx, atl_req_t& req) {
    // Hold atl_status_t values directly instead of a ssize_t that is
    // only ever compared against atl_status_t constants.
    atl_status_t ret = ATL_STATUS_SUCCESS;

    req.is_completed = false;
    atl_ofi_req_t* ofi_req = ((atl_ofi_req_t*)req.internal);

    // Single-rank communicator: nothing to synchronize with.
    if (size == 1) {
        ofi_req->comp_state = ATL_OFI_COMP_COMPLETED;
        return ATL_STATUS_SUCCESS;
    }

    // Use the reserved max id when this comm has no assigned id, so tag
    // matching still disambiguates this barrier from other traffic.
    int tag_comm_id = (comm_id != atl_comm_id_storage::invalid_comm_id)
                          ? comm_id
                          : atl_comm_id_storage::max_comm_id;
    // Per-barrier counter keeps consecutive barriers from matching each
    // other's messages.
    int tagc = tag_counter++;
    int src, dst;
    const int len = 1;
    // Zero-initialize: the payload is a pure synchronization token, but
    // sending an indeterminate byte is an uninitialized read.
    char sendbuf[len] = { 0 };
    char recvbuf[len] = { 0 };
    int mask = 0x1;
    while (mask < size) {
        dst = (rank + mask) % size;
        src = (rank - mask + size) % size;
        atl_req send_req, recv_req;
        // Tag encodes the *sender's* rank, so the recv side below must
        // build its tag from `src`, not from our own rank.
        uint64_t op_tag = tag_creator->create(rank, tag_comm_id, tagc, 1);
        // Retry on ATL_STATUS_AGAIN (transient resource exhaustion),
        // yielding between attempts; hard-fail on ATL_STATUS_FAILURE.
        do {
            ret = send(ep_idx, sendbuf, len, dst, op_tag, send_req);
            CCL_THROW_IF_NOT(ret != ATL_STATUS_FAILURE, "send failed");
            if (ret == ATL_STATUS_AGAIN) {
                ccl_yield(ccl::global_data::env().yield_type);
            }
        } while (ret == ATL_STATUS_AGAIN);
        op_tag = tag_creator->create(src, tag_comm_id, tagc, 1);
        do {
            ret = recv(ep_idx, recvbuf, len, src, op_tag, recv_req);
            CCL_THROW_IF_NOT(ret != ATL_STATUS_FAILURE, "recv failed");
            if (ret == ATL_STATUS_AGAIN) {
                ccl_yield(ccl::global_data::env().yield_type);
            }
        } while (ret == ATL_STATUS_AGAIN);
        // Drive progress until both operations of this round complete;
        // check() also surfaces any completion-time failure.
        while (!send_req.is_completed || !recv_req.is_completed) {
            poll(ep_idx);
            if (!send_req.is_completed) {
                CCL_THROW_IF_NOT(check(ep_idx, send_req) != ATL_STATUS_FAILURE,
                                 "check send failed");
            }
            if (!recv_req.is_completed) {
                CCL_THROW_IF_NOT(check(ep_idx, recv_req) != ATL_STATUS_FAILURE,
                                 "check recv failed");
            }
        }
        mask <<= 1;
    }

    ofi_req->comp_state = ATL_OFI_COMP_COMPLETED;
    return ATL_STATUS_SUCCESS;
}
104+
48105
atl_status_t atl_ofi_comm::allgatherv(size_t ep_idx,
49106
const void* send_buf,
50107
size_t send_len,

src/atl/ofi/atl_ofi_comm.hpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -118,9 +118,7 @@ class atl_ofi_comm : public atl_base_comm {
118118
return ATL_STATUS_UNSUPPORTED;
119119
}
120120

121-
atl_status_t barrier(size_t ep_idx, atl_req_t& req) override {
122-
return ATL_STATUS_UNSUPPORTED;
123-
}
121+
atl_status_t barrier(size_t ep_idx, atl_req_t& req) override;
124122

125123
atl_status_t bcast(size_t ep_idx, void* buf, size_t len, int root, atl_req_t& req) override {
126124
return ATL_STATUS_UNSUPPORTED;

src/coll/algorithms/allgatherv/sycl/allgatherv_large_sycl.cpp

Lines changed: 36 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -70,27 +70,45 @@ ccl::event allgatherv_large(const void* send_buf,
7070
dep.wait();
7171
}
7272
}
73-
std::vector<void*> ptrs{ (void*)send_buf, recv_buf }; // index 0 and 1
74-
auto [sched, exchange_entry] = do_ipc_exchange(comm, global_stream, ptrs);
7573

76-
sycl_ptrs.xelink_ptrs_rd = get_ipc_ptrs<void, MAX_GPUS>(even_comm, 0, (void*)send_buf, sched);
77-
sycl_ptrs.xelink_ptrs_wr = get_ipc_ptrs<void, MAX_GPUS>(even_comm, 1, recv_buf, sched);
78-
// use full vector (>= 8 bytes) if remote buffers and data size are 4 byte aligned
79-
use_full_vector = use_full_vector &&
80-
all_aligned(sycl_ptrs.xelink_ptrs_rd.data(), even_comm->size(), send_count * dsize, 4) &&
81-
all_aligned(sycl_ptrs.xelink_ptrs_wr.data(), even_comm->size(), send_count * dsize, 4);
74+
if (is_arc_card(ccl::ze::get_device_family(global_stream->get_ze_device()))) {
75+
// only need output buffer
76+
std::vector<void*> ptrs{ recv_buf }; // index 0
77+
auto [sched, exchange_entry] = do_ipc_exchange(comm, global_stream, ptrs);
8278

83-
if (pair_comm->size() > 1) {
84-
assert(pair_comm->size() == MAX_TILES);
85-
int peer_pair_rank = pair_comm->rank() ? 0 : 1;
86-
sycl_ptrs.mdfi_ptr_rd =
87-
get_ipc_ptrs<void, MAX_TILES>(pair_comm, 0, (void*)send_buf, sched)[peer_pair_rank];
88-
sycl_ptrs.mdfi_ptr_wr = get_ipc_ptrs<void, MAX_TILES>(pair_comm, 1, recv_buf, sched)[peer_pair_rank];
89-
use_full_vector = use_full_vector && all_aligned(&sycl_ptrs.mdfi_ptr_rd, 1, send_count * dsize, 4) &&
90-
all_aligned(&sycl_ptrs.mdfi_ptr_wr, 1, send_count * dsize, 4);
79+
std::shared_ptr<ccl_comm> node_comm = comm->get_node_comm();
80+
sycl_ptrs.node_ptrs_wr = get_ipc_ptrs<void, MAX_NODE_RANKS>(node_comm, 0, recv_buf, sched);
81+
82+
delete exchange_entry;
83+
delete sched;
84+
}
85+
else {
86+
std::vector<void*> ptrs{ (void*)send_buf, recv_buf }; // index 0 and 1
87+
auto [sched, exchange_entry] = do_ipc_exchange(comm, global_stream, ptrs);
88+
89+
sycl_ptrs.xelink_ptrs_rd = get_ipc_ptrs<void, MAX_GPUS>(even_comm, 0, (void*)send_buf, sched);
90+
sycl_ptrs.xelink_ptrs_wr = get_ipc_ptrs<void, MAX_GPUS>(even_comm, 1, recv_buf, sched);
91+
// use full vector (>= 8 bytes) if remote buffers and data size are 4 byte aligned
92+
use_full_vector =
93+
use_full_vector &&
94+
all_aligned(sycl_ptrs.xelink_ptrs_rd.data(), even_comm->size(), send_count * dsize, 4) &&
95+
all_aligned(sycl_ptrs.xelink_ptrs_wr.data(), even_comm->size(), send_count * dsize, 4);
96+
97+
if (pair_comm->size() > 1) {
98+
assert(pair_comm->size() == MAX_TILES);
99+
int peer_pair_rank = pair_comm->rank() ? 0 : 1;
100+
sycl_ptrs.mdfi_ptr_rd =
101+
get_ipc_ptrs<void, MAX_TILES>(pair_comm, 0, (void*)send_buf, sched)[peer_pair_rank];
102+
sycl_ptrs.mdfi_ptr_wr =
103+
get_ipc_ptrs<void, MAX_TILES>(pair_comm, 1, recv_buf, sched)[peer_pair_rank];
104+
use_full_vector = use_full_vector &&
105+
all_aligned(&sycl_ptrs.mdfi_ptr_rd, 1, send_count * dsize, 4) &&
106+
all_aligned(&sycl_ptrs.mdfi_ptr_wr, 1, send_count * dsize, 4);
107+
}
108+
109+
delete exchange_entry;
110+
delete sched;
91111
}
92-
delete exchange_entry;
93-
delete sched;
94112

95113
// coll_init(comm, global_stream);
96114
}

src/coll/algorithms/allgatherv/sycl/allgatherv_large_sycl_impl.hpp

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,39 @@ ccl::event allgatherv_large_impl_ipc(const void* send_buf,
214214

215215
std::vector<sycl::event> dep_events = get_sycl_events(deps);
216216

217+
if (is_arc_card(ccl::ze::get_device_family(global_stream->get_ze_device()))) {
218+
sycl::event kernel_event;
219+
220+
sycl::event barrier_event1 = invoke_barrier(node_comm, q, dep_events, is_cpu_barrier);
221+
222+
int rank = comm->rank();
223+
for (int i = 0; i < N; i++) {
224+
// scatter the ranks and limit the amount to copy
225+
int peer = (rank + i) % N;
226+
// limit amount of write due to crash in KMD (read timeout error)
227+
const size_t max_chunk = 512 * 1024 * 1024;
228+
size_t left = send_count * dsize;
229+
size_t offset = 0;
230+
while (left > 0) {
231+
size_t chunk = left > max_chunk ? max_chunk : left;
232+
kernel_event = q.submit([=](sycl::handler& h) {
233+
h.depends_on(barrier_event1);
234+
h.memcpy(((char*)sycl_ptrs.node_ptrs_wr[peer] + rank * send_count * dsize) + offset,
235+
(char*)send_buf + offset,
236+
chunk);
237+
});
238+
left -= chunk;
239+
offset += chunk;
240+
// skip the barrier for the very last iterations
241+
if (i < N - 1 || left > 0)
242+
kernel_event = invoke_barrier(node_comm, q, { kernel_event }, is_cpu_barrier);
243+
}
244+
}
245+
246+
kernel_event = invoke_barrier(node_comm, q, { kernel_event }, is_cpu_barrier);
247+
return ccl::event::create_from_native(kernel_event);
248+
}
249+
217250
std::array<void*, MAX_GPUS> local_peer_even_ptrs, local_local_ptrs, local_peer_pair_ptrs;
218251
for (int i = 0; i < even_comm->size(); i++) {
219252
// offsets for read_write kernel

src/coll/algorithms/allgatherv/sycl/allgatherv_pcie.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ ccl::event allgatherv_ll_ring(const void *send_buf,
4646
uint32_t pattern = comm->get_rt_pattern(pattern_type::collective, -1);
4747

4848
auto lambda = [&]<typename T, template <typename, int> class Proto>(int NRanks) {
49+
const size_t *offs = offsets.empty() ? NULL : offsets.data();
50+
4951
T *peerbuf0[NRanks];
5052
T *peerbuf1[NRanks];
5153
for (int i = 0; i < NRanks; i++) {
@@ -57,6 +59,7 @@ ccl::event allgatherv_ll_ring(const void *send_buf,
5759
sycl::event e = AllGather<T, Proto, RingTransmit>::launch(NRanks,
5860
(T *)send_buf,
5961
(T *)recv_buf,
62+
offs,
6063
ipcbuf0,
6164
ipcbuf1,
6265
peerbuf0,

src/coll/algorithms/allgatherv/sycl/allgatherv_pcie.hpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ struct AllGather : public Transmit<T, Proto, SubGroupSize> {
3535
AllGather(int nranks,
3636
T* input,
3737
T* output,
38+
const size_t* offsets,
3839
size_t nelems,
3940
int rank,
4041
uint32_t seqNo,
@@ -46,6 +47,7 @@ struct AllGather : public Transmit<T, Proto, SubGroupSize> {
4647
: Transmit<T, Proto, SubGroupSize>(nranks,
4748
input,
4849
output,
50+
offsets,
4951
scatterBuf,
5052
gatherBuf,
5153
peerBuf0,
@@ -83,6 +85,7 @@ struct AllGather : public Transmit<T, Proto, SubGroupSize> {
8385
static sycl::event launch(int nranks,
8486
T* input,
8587
T* output,
88+
const size_t* offsets,
8689
T* ipcbuf0,
8790
T* ipcbuf1,
8891
T* const peerbuf0[],
@@ -94,7 +97,8 @@ struct AllGather : public Transmit<T, Proto, SubGroupSize> {
9497
bool p2p,
9598
bool& done) {
9699
sycl::event e;
97-
AllGather offload(nranks, input, output, nelems, rank, step, ipcbuf0, ipcbuf1, peerbuf0, peerbuf1, p2p);
100+
AllGather offload(
101+
nranks, input, output, offsets, nelems, rank, step, ipcbuf0, ipcbuf1, peerbuf0, peerbuf1, p2p);
98102
if (offload.workSize == 0) {
99103
done = false;
100104
return e;

src/coll/algorithms/allgatherv/sycl/allgatherv_sycl.cpp

Lines changed: 58 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -76,11 +76,58 @@ ccl::event allgather_sycl_single_node(sycl::queue& q,
7676
done = false;
7777
return e;
7878
}
79-
LOG_DEBUG("invoking allgatherv LL256 kernel, send_count:", send_count, " datatype: ", dtype);
80-
e = allgatherv_ll_ring(
81-
send_buf, send_count, recv_buf, recv_counts, offsets, dtype, comm, global_stream, deps, done);
82-
LOG_DEBUG("invoking allgatherv LL256 kernel, count:", send_count, " datatype: ", dtype, " done");
83-
return e;
79+
if (send_count * ccl_dtype.size() < 256 * 1024 || !ccl::global_data::env().sycl_ccl_barrier ||
80+
ccl::global_data::env().sycl_allgatherv_tmp_buf) {
81+
int node_size = comm->size();
82+
const int chunk_size = ccl::global_data::env().sycl_allgatherv_chunking_threshold;
83+
size_t max_pack_count;
84+
if (chunk_size == 0 || send_count * ccl_dtype.size() <= chunk_size) {
85+
max_pack_count = send_count;
86+
}
87+
else {
88+
max_pack_count = chunk_size;
89+
int typesize = std::max(4, (int)ccl_dtype.size());
90+
max_pack_count = max_pack_count / typesize * typesize;
91+
max_pack_count = max_pack_count / ccl_dtype.size();
92+
CCL_ASSERT(max_pack_count > 0);
93+
}
94+
95+
int send_offset = 0;
96+
int nchunks = (send_count + max_pack_count - 1) / max_pack_count;
97+
for (int iter = 0; iter < nchunks; iter++) {
98+
int pack_count = (iter < nchunks - 1) ? max_pack_count : send_count - send_offset;
99+
std::vector<size_t> scaleup_counts(node_size, pack_count);
100+
std::vector<size_t> scaleup_offsets(node_size);
101+
for (int r = 0; r < node_size; r++) {
102+
scaleup_offsets[r] = (offsets.empty() ? r * send_count * ccl_dtype.size() : offsets[r]) +
103+
send_offset * ccl_dtype.size();
104+
}
105+
#ifdef CCL_ENABLE_ITT
106+
ccl::profile::itt::task_begin("allgatherv_small", "send_size", pack_count * ccl_dtype.size());
107+
#endif // CCL_ENABLE_ITT
108+
LOG_DEBUG("invoking allgatherv LL256 kernel, send_count:", pack_count, " datatype: ", dtype);
109+
e = allgatherv_ll_ring((char*)send_buf + send_offset * ccl_dtype.size(),
110+
pack_count,
111+
recv_buf,
112+
scaleup_counts,
113+
scaleup_offsets,
114+
dtype,
115+
comm,
116+
global_stream,
117+
deps,
118+
done);
119+
LOG_DEBUG("invoking allgatherv LL256 kernel, count:", pack_count, " datatype: ", dtype, " done");
120+
#ifdef CCL_ENABLE_ITT
121+
ccl::profile::itt::task_end();
122+
#endif // CCL_ENABLE_ITT
123+
send_offset += pack_count;
124+
} // for
125+
return e;
126+
}
127+
CCL_THROW_IF_NOT(ccl::global_data::env().sycl_ccl_barrier,
128+
"To run on BMG, CCL_SYCL_CCL_BARRIER must be set to 1");
129+
CCL_THROW_IF_NOT(ccl::global_data::env().sycl_allgatherv_tmp_buf == 0,
130+
"To run on BMG, CCL_SYCL_ALLGATHERV_TMP_BUF must be set to 0");
84131
}
85132

86133
if (!ccl::global_data::env().sycl_esimd) {
@@ -281,8 +328,12 @@ ccl::event allgatherv_sycl_multi_node(sycl::queue& q,
281328
{
282329
std::vector<size_t> scaleup_counts(node_size, pack_count);
283330
for (int i = 0; i < r2r_size; i++) {
284-
std::vector<size_t> scaleup_offsets(global_offsets.begin() + i * node_size,
285-
global_offsets.begin() + (i + 1) * node_size);
331+
std::vector<size_t> scaleup_offsets(node_size);
332+
for (int r = 0; r < node_size; r++) {
333+
const int global_rank = r + i * node_size;
334+
scaleup_offsets[r] = (send_count * global_rank + send_offset) * ccl_dtype.size();
335+
}
336+
286337
ev = allgather_sycl_single_node(q,
287338
(char*)(scaleout_buf) + scaleout_offsets[i],
288339
recv_scaleout_counts[i],

src/coll/algorithms/allreduce/sycl/allreduce_ring_ll256.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,9 @@ sycl::event arc_ll256_allreduce(const void *src,
175175
/* To avoid pattern not changed when "iters" is 1 */
176176
pattern_t pattern_prefix = ++pattern_counter << 16;
177177

178+
size_t persist_buf_size = ccl::global_data::env().sycl_tmp_buf_size / 3;
179+
const int GATHER_BUF_OFFSET = persist_buf_size / 2;
180+
178181
sycl_e = q.submit([&](auto &h) {
179182
//using namespace sycl::ext::intel::experimental::esimd;
180183

0 commit comments

Comments
 (0)