Reimplemented map_back_impl to process few elements per work-item

oleksandr-pavlyk · ndgrigorian · commit 2608aea15a7b · 2025-01-04T17:16:11.000-08:00
Factored out map_back_impl projects indexing from flat index to a
row-wise index.

Removed dead code excluded by preprocessor conditional.
diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp
@@ -816,24 +816,9 @@ sycl::event stable_argsort_axis1_contig_impl(
 
     using IotaKernelName = populate_index_data_krn<argTy, IndexTy, ValueComp>;
 
-#if 1
     sycl::event populate_indexed_data_ev = iota_impl<IotaKernelName, IndexTy>(
         exec_q, res_tp, total_nelems, depends);
 
-#else
-    sycl::event populate_indexed_data_ev =
-        exec_q.submit([&](sycl::handler &cgh) {
-            cgh.depends_on(depends);
-
-            const sycl::range<1> range{total_nelems};
-
-            cgh.parallel_for<IotaKernelName>(range, [=](sycl::id<1> id) {
-                size_t i = id[0];
-                res_tp[i] = static_cast<IndexTy>(i);
-            });
-        });
-#endif
-
     // Sort segments of the array
     sycl::event base_sort_ev =
         merge_sort_detail::sort_over_work_group_contig_impl(
@@ -847,21 +832,11 @@ sycl::event stable_argsort_axis1_contig_impl(
         exec_q, iter_nelems, sort_nelems, res_tp, index_comp, sorted_block_size,
         {base_sort_ev});
 
-    sycl::event write_out_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(merges_ev);
-
-        auto temp_acc =
-            merge_sort_detail::GetReadOnlyAccess<decltype(res_tp)>{}(res_tp,
-                                                                     cgh);
+    using MapBackKernelName = index_map_to_rows_krn<argTy, IndexTy, ValueComp>;
+    using dpctl::tensor::kernels::sort_utils_detail::map_back_impl;
 
-        using KernelName = index_map_to_rows_krn<argTy, IndexTy, ValueComp>;
-
-        const sycl::range<1> range{total_nelems};
-
-        cgh.parallel_for<KernelName>(range, [=](sycl::id<1> id) {
-            res_tp[id] = (temp_acc[id] % sort_nelems);
-        });
-    });
+    sycl::event write_out_ev = map_back_impl<MapBackKernelName, IndexTy>(
+        exec_q, total_nelems, res_tp, res_tp, sort_nelems, {merges_ev});
 
     return write_out_ev;
 }
diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp
@@ -1766,41 +1766,21 @@ radix_argsort_axis1_contig_impl(sycl::queue &exec_q,
 
     using IotaKernelName = radix_argsort_iota_krn<argTy, IndexTy>;
 
-#if 1
     using dpctl::tensor::kernels::sort_utils_detail::iota_impl;
 
     sycl::event iota_ev = iota_impl<IotaKernelName, IndexTy>(
         exec_q, workspace, total_nelems, depends);
-#else
-
-    sycl::event iota_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        cgh.parallel_for<IotaKernelName>(
-            sycl::range<1>(total_nelems), [=](sycl::id<1> id) {
-                size_t i = id[0];
-                IndexTy sort_id = static_cast<IndexTy>(i);
-                workspace[i] = sort_id;
-            });
-    });
-#endif
 
     sycl::event radix_sort_ev =
         radix_sort_details::parallel_radix_sort_impl<IndexTy, IndexedProjT>(
             exec_q, iter_nelems, sort_nelems, workspace, res_tp, proj_op,
             sort_ascending, {iota_ev});
 
-    sycl::event map_back_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(radix_sort_ev);
-
-        using KernelName = radix_argsort_index_write_out_krn<argTy, IndexTy>;
+    using MapBackKernelName = radix_argsort_index_write_out_krn<argTy, IndexTy>;
+    using dpctl::tensor::kernels::sort_utils_detail::map_back_impl;
 
-        cgh.parallel_for<KernelName>(
-            sycl::range<1>(total_nelems), [=](sycl::id<1> id) {
-                IndexTy linear_index = res_tp[id];
-                res_tp[id] = (linear_index % sort_nelems);
-            });
-    });
+    sycl::event map_back_ev = map_back_impl<MapBackKernelName, IndexTy>(
+        exec_q, total_nelems, res_tp, res_tp, sort_nelems, {radix_sort_ev});
 
     sycl::event cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
         exec_q, {map_back_ev}, workspace_owner);
diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp
@@ -97,6 +97,27 @@ sycl::event iota_impl(sycl::queue &exec_q,
     return e;
 }
 
+template <class KernelName, typename IndexTy>
+sycl::event map_back_impl(sycl::queue &exec_q,
+                          std::size_t nelems,
+                          const IndexTy *flat_index_data,
+                          IndexTy *reduced_index_data,
+                          std::size_t row_size,
+                          const std::vector<sycl::event> &dependent_events)
+{
+    sycl::event map_back_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(dependent_events);
+
+        cgh.parallel_for<KernelName>(
+            sycl::range<1>(nelems), [=](sycl::id<1> id) {
+                const IndexTy linear_index = flat_index_data[id];
+                reduced_index_data[id] = (linear_index % row_size);
+            });
+    });
+
+    return map_back_ev;
+}
+
 } // end of namespace sort_utils_detail
 } // end of namespace kernels
 } // end of namespace tensor
diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp
@@ -83,23 +83,50 @@ sycl::event write_out_impl(sycl::queue &exec_q,
                            IndexTy *inds_tp,
                            const std::vector<sycl::event> &depends)
 {
-    sycl::event write_out_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        cgh.parallel_for<KernelName>(iter_nelems * k, [=](sycl::id<1> id) {
-            const std::size_t gid = id[0];
+    constexpr std::uint32_t lws = 64;
+    constexpr std::uint32_t n_wi = 4;
+    const std::size_t nelems = iter_nelems * k;
+    const std::size_t n_groups = (nelems + lws * n_wi - 1) / (n_wi * lws);
 
-            const std::size_t iter_gid = gid / k;
-            const std::size_t axis_gid = gid - (iter_gid * k);
+    sycl::range<1> lRange{lws};
+    sycl::range<1> gRange{n_groups * lws};
+    sycl::nd_range<1> ndRange{gRange, lRange};
 
-            const std::size_t src_idx = iter_gid * iter_index_stride + axis_gid;
-            const std::size_t dst_idx = gid;
-
-            const IndexTy res_ind = index_data[src_idx];
-            const argTy v = arg_tp[res_ind];
+    sycl::event write_out_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
 
-            vals_tp[dst_idx] = v;
-            inds_tp[dst_idx] = (res_ind % axis_nelems);
+        cgh.parallel_for<KernelName>(ndRange, [=](sycl::nd_item<1> it) {
+            const std::size_t gid = it.get_global_linear_id();
+            const auto &sg = it.get_sub_group();
+            const std::uint32_t lane_id = sg.get_local_id()[0];
+            const std::uint32_t sg_size = sg.get_max_local_range()[0];
+
+            const std::size_t start_id =
+                (gid - lane_id) * sg_size * n_wi + lane_id;
+
+#pragma unroll
+            for (std::uint32_t i = 0; i < n_wi; ++i) {
+                const std::size_t data_id = start_id + i * sg_size;
+
+                if (data_id < nelems) {
+                    const std::size_t iter_id = data_id / k;
+
+                    /*
+                    const std::size_t axis_gid = data_id - (iter_gid * k);
+                    const std::size_t src_idx = iter_gid * iter_index_stride +
+                    axis_gid;
+                    */
+                    const std::size_t src_idx =
+                        data_id + iter_id * (iter_index_stride - k);
+
+                    const IndexTy res_ind = index_data[src_idx];
+                    const argTy v = arg_tp[res_ind];
+
+                    const std::size_t dst_idx = data_id;
+                    vals_tp[dst_idx] = v;
+                    inds_tp[dst_idx] = (res_ind % axis_nelems);
+                }
+            }
         });
     });