From bde928e25ef2a8ed7d61285dfcf8d97d830a4945 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Thu, 18 Jan 2024 17:46:33 +0530 Subject: [PATCH 01/65] load_helpers --- .../dpct/dpl_extras/dpcpp_extensions.h | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 84ef47773acc..600ffa81a824 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -564,6 +564,38 @@ class radix_sort { uint8_t *_local_memory; }; +/// Load linear segment items into block format across threads +/// Helper for Block Load +template +void load_blocked(int linear_tid, InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD]) { +#pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { + + if ((linear_tid * ITEMS_PER_THREAD) + ITEM < GROUP_THREADS){ + + items[ITEM] = block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM]; + + } + } +} + +template +void load_striped(int linear_tid, InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD]) { +#pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { + + if((linear_tid + (ITEM * ITEMS_PER_THREAD) < GROUP_THREADS){ + + items[ITEM] = block_itr[linear_tid + (ITEM * ITEMS_PER_THREAD)]; + + } + } +} + /// Perform a reduction of the data elements assigned to all threads in the /// group. /// From 8717079461678aef90bc5a14848fa5bad8d1b133 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Mon, 29 Jan 2024 20:38:24 +0530 Subject: [PATCH 02/65] add load apis --- .../dpct/dpl_extras/dpcpp_extensions.h | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 600ffa81a824..eef506db10f5 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -568,7 +568,7 @@ class radix_sort { /// Helper for Block Load template -void load_blocked(int linear_tid, InputIteratorT block_itr, +__dpct_inline__ void load_blocked(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { @@ -583,7 +583,7 @@ void load_blocked(int linear_tid, InputIteratorT block_itr, template -void load_striped(int linear_tid, InputIteratorT block_itr, +__dpct_inline__ void load_striped(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { @@ -596,6 +596,22 @@ void load_striped(int linear_tid, InputIteratorT block_itr, } } +template +__dpct_inline__ void load_warp_striped(int linear_tid, InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD]) { + + int tid = linear_tid & 1; + int wid = linear_tid >> 1; + int warp_offset = wid * ITEMS_PER_THREAD; +#pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { + + new(&items[ITEM]) InputT(block_itr[warp_offset + tid + (ITEM * 1)]); + + } +} + /// Perform a reduction of the data elements assigned to all threads in the /// group. /// From 154964205a8a117a272901bf9029bcd49cf0be46 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Mon, 29 Jan 2024 21:05:07 +0530 Subject: [PATCH 03/65] clang-format --- .../dpct/dpl_extras/dpcpp_extensions.h | 33 +++++++++---------- 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index eef506db10f5..a69c0f195e06 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -565,50 +565,47 @@ class radix_sort { }; /// Load linear segment items into block format across threads -/// Helper for Block Load +/// Helper for Block Load template __dpct_inline__ void load_blocked(int linear_tid, InputIteratorT block_itr, - InputT (&items)[ITEMS_PER_THREAD]) { + InputT (&items)[ITEMS_PER_THREAD]) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - - if ((linear_tid * ITEMS_PER_THREAD) + ITEM < GROUP_THREADS){ - - items[ITEM] = block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM]; - - } + + if ((linear_tid * ITEMS_PER_THREAD) + ITEM < GROUP_THREADS) { + + items[ITEM] = block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM]; + } } } template __dpct_inline__ void load_striped(int linear_tid, InputIteratorT block_itr, - InputT (&items)[ITEMS_PER_THREAD]) { + InputT (&items)[ITEMS_PER_THREAD]) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { if((linear_tid + (ITEM * ITEMS_PER_THREAD) < GROUP_THREADS){ - items[ITEM] = block_itr[linear_tid + (ITEM * ITEMS_PER_THREAD)]; - } + } } } template __dpct_inline__ void load_warp_striped(int linear_tid, InputIteratorT block_itr, - InputT (&items)[ITEMS_PER_THREAD]) { + InputT (&items)[ITEMS_PER_THREAD]) { - int tid = linear_tid & 1; - int wid = linear_tid >> 1; - int warp_offset = wid * ITEMS_PER_THREAD; + int tid = linear_tid & 1; + int wid = linear_tid >> 1; + int warp_offset = wid * ITEMS_PER_THREAD; #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - - new(&items[ITEM]) InputT(block_itr[warp_offset + tid + (ITEM * 1)]); - + + new (&items[ITEM]) InputT(block_itr[warp_offset + tid + (ITEM * 1)]); } } From 41d994e5ad812b059b1f0ecd188f58ff45f6d7b5 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Tue, 30 Jan 2024 09:06:39 +0530 Subject: [PATCH 04/65] reviews 1 --- .../include/dpct/dpl_extras/dpcpp_extensions.h | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index a69c0f195e06..a9c9edd01196 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -568,13 +568,11 @@ class radix_sort { /// Helper for Block Load template -__dpct_inline__ void load_blocked(int linear_tid, InputIteratorT block_itr, +__dpct_inline__ void load_blocked(size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - if ((linear_tid * ITEMS_PER_THREAD) + ITEM < GROUP_THREADS) { - items[ITEM] = block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM]; } } @@ -582,21 +580,19 @@ __dpct_inline__ void load_blocked(int linear_tid, InputIteratorT block_itr, template -__dpct_inline__ void load_striped(int linear_tid, InputIteratorT block_itr, +__dpct_inline__ void load_striped(size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - if((linear_tid + (ITEM * ITEMS_PER_THREAD) < GROUP_THREADS){ items[ITEM] = block_itr[linear_tid + (ITEM * ITEMS_PER_THREAD)]; - } } } template -__dpct_inline__ void load_warp_striped(int linear_tid, InputIteratorT block_itr, +__dpct_inline__ void load_warp_striped(size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { int tid = linear_tid & 1; @@ -604,7 +600,6 @@ __dpct_inline__ void load_warp_striped(int linear_tid, InputIteratorT block_itr, int warp_offset = wid * ITEMS_PER_THREAD; #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - new (&items[ITEM]) InputT(block_itr[warp_offset + tid + (ITEM * 1)]); } } From 99df7d92dd8617386b8f24382ef18a8c890a4deb Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Tue, 30 Jan 2024 12:31:29 +0530 Subject: [PATCH 05/65] fix lit --- .../runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index a9c9edd01196..9cb13df31454 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -584,7 +584,7 @@ __dpct_inline__ void load_striped(size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - if((linear_tid + (ITEM * ITEMS_PER_THREAD) < GROUP_THREADS){ + if(linear_tid + (ITEM * ITEMS_PER_THREAD) < GROUP_THREADS){ items[ITEM] = block_itr[linear_tid + (ITEM * ITEMS_PER_THREAD)]; } } From 789bd18cff1aa46f848eafbc22aad2ee289887ab Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Tue, 30 Jan 2024 15:29:43 +0530 Subject: [PATCH 06/65] update warp striped logic --- .../dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 9cb13df31454..bb564ecb65e2 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -595,12 +595,12 @@ template > 1; - int warp_offset = wid * ITEMS_PER_THREAD; + //int tid = linear_tid & 1; + //int wid = linear_tid >> 5; + //int warp_offset = wid * ITEMS_PER_THREAD; #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - new (&items[ITEM]) InputT(block_itr[warp_offset + tid + (ITEM * 1)]); + new (&items[ITEM]) InputT(block_itr[&items[ITEM].get_sub_group().get_local_range()[0]] + linear_tid + (ITEM * ITEMS_PER_THREAD)]); } } From 7872841636a77e4de5d4aeec43282417b8e87a4b Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Tue, 30 Jan 2024 15:30:42 +0530 Subject: [PATCH 07/65] rename warp to subgroup --- .../runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index bb564ecb65e2..677adc2de648 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -592,7 +592,7 @@ __dpct_inline__ void load_striped(size_t linear_tid, InputIteratorT block_itr, template -__dpct_inline__ void load_warp_striped(size_t linear_tid, InputIteratorT block_itr, +__dpct_inline__ void load_subgroup_striped(size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { //int tid = linear_tid & 1; From ec7a71876458256ee9fe8b0e68586f0ba0a97c26 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Tue, 30 Jan 2024 15:34:12 +0530 Subject: [PATCH 08/65] clang-format --- .../include/dpct/dpl_extras/dpcpp_extensions.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 677adc2de648..d9191b08da61 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -584,7 +584,7 @@ __dpct_inline__ void load_striped(size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - if(linear_tid + (ITEM * ITEMS_PER_THREAD) < GROUP_THREADS){ + if (linear_tid + (ITEM * ITEMS_PER_THREAD) < GROUP_THREADS) { items[ITEM] = block_itr[linear_tid + (ITEM * ITEMS_PER_THREAD)]; } } @@ -592,12 +592,13 @@ __dpct_inline__ void load_striped(size_t linear_tid, InputIteratorT block_itr, template -__dpct_inline__ void load_subgroup_striped(size_t linear_tid, InputIteratorT block_itr, - InputT (&items)[ITEMS_PER_THREAD]) { +__dpct_inline__ void load_subgroup_striped(size_t linear_tid, + InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD]) { - //int tid = linear_tid & 1; - //int wid = linear_tid >> 5; - //int warp_offset = wid * ITEMS_PER_THREAD; + // int tid = linear_tid & 1; + // int wid = linear_tid >> 5; + // int warp_offset = wid * ITEMS_PER_THREAD; #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { new (&items[ITEM]) InputT(block_itr[&items[ITEM].get_sub_group().get_local_range()[0]] + linear_tid + (ITEM * ITEMS_PER_THREAD)]); From fe4c38ed871721c3a56598c55ab2f45f8f6a5089 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Wed, 31 Jan 2024 13:47:17 +0530 Subject: [PATCH 09/65] review commits 2 --- .../include/dpct/dpl_extras/dpcpp_extensions.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index d9191b08da61..2c0be94ece06 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -566,33 +566,34 @@ class radix_sort { /// Load linear segment items into block format across threads /// Helper for Block Load -template __dpct_inline__ void load_blocked(size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - if ((linear_tid * ITEMS_PER_THREAD) + ITEM < GROUP_THREADS) { + if ((linear_tid * ITEMS_PER_THREAD) + ITEM < GROUP_WORK_ITEMS) { items[ITEM] = block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM]; } } } -template __dpct_inline__ void load_striped(size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - if (linear_tid + (ITEM * ITEMS_PER_THREAD) < GROUP_THREADS) { + if (linear_tid + (ITEM * ITEMS_PER_THREAD) < GROUP_WORK_ITEMS) { items[ITEM] = block_itr[linear_tid + (ITEM * ITEMS_PER_THREAD)]; } } } -template -__dpct_inline__ void load_subgroup_striped(size_t linear_tid, +__dpct_inline__ void load_subgroup_striped(const Item &item, + size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { @@ -601,7 +602,7 @@ __dpct_inline__ void load_subgroup_striped(size_t linear_tid, // int warp_offset = wid * ITEMS_PER_THREAD; #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - new (&items[ITEM]) InputT(block_itr[&items[ITEM].get_sub_group().get_local_range()[0]] + linear_tid + (ITEM * ITEMS_PER_THREAD)]); + new (&items[ITEM]) InputT(block_itr[item.get_sub_group().get_local_range()[0]] + linear_tid + (ITEM * ITEMS_PER_THREAD)]); } } From a4e2316aca361b055998bfa58cedd768b6867826 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Wed, 31 Jan 2024 17:13:14 +0530 Subject: [PATCH 10/65] fix lit --- .../runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 2c0be94ece06..ba638f0f80e8 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -602,7 +602,7 @@ __dpct_inline__ void load_subgroup_striped(const Item &item, // int warp_offset = wid * ITEMS_PER_THREAD; #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - new (&items[ITEM]) InputT(block_itr[item.get_sub_group().get_local_range()[0]] + linear_tid + (ITEM * ITEMS_PER_THREAD)]); + new (&items[ITEM]) InputT(block_itr[item.get_sub_group().get_local_range()[0] + linear_tid + (ITEM * ITEMS_PER_THREAD)]); } } From 95edd0ebe2538fcf8e3eaaa395efe140add2a6b0 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Thu, 1 Feb 2024 15:24:55 +0530 Subject: [PATCH 11/65] clang-format --- .../include/dpct/dpl_extras/dpcpp_extensions.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index c70ac6e5912f..93fe041da84b 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -592,10 +592,9 @@ __dpct_inline__ void load_striped(size_t linear_tid, InputIteratorT block_itr, } } -template -__dpct_inline__ void load_subgroup_striped(const Item &item, - size_t linear_tid, +template +__dpct_inline__ void load_subgroup_striped(const Item &item, size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { @@ -604,7 +603,9 @@ __dpct_inline__ void load_subgroup_striped(const Item &item, // int warp_offset = wid * ITEMS_PER_THREAD; #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - new (&items[ITEM]) InputT(block_itr[item.get_sub_group().get_local_range()[0] + linear_tid + (ITEM * ITEMS_PER_THREAD)]); + new (&items[ITEM]) + InputT(block_itr[item.get_sub_group().get_local_range()[0] + + linear_tid + (ITEM * ITEMS_PER_THREAD)]); } } From 14192538b4bf5e211154fcd37a2f68cc67addea1 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Fri, 2 Feb 2024 11:32:11 +0530 Subject: [PATCH 12/65] update review 1 --- .../dpct/dpl_extras/dpcpp_extensions.h | 29 +++++++++---------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 93fe041da84b..d38d3cb7e647 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -568,44 +568,41 @@ class radix_sort { /// Load linear segment items into block format across threads /// Helper for Block Load -template __dpct_inline__ void load_blocked(size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - if ((linear_tid * ITEMS_PER_THREAD) + ITEM < GROUP_WORK_ITEMS) { - items[ITEM] = block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM]; + for (int item = 0; item < ITEMS_PER_THREAD; item++) { + if ((linear_tid * ITEMS_PER_THREAD) + item < GROUP_WORK_ITEMS) { + items[item] = block_itr[(linear_tid * ITEMS_PER_THREAD) + item]; } } } -template __dpct_inline__ void load_striped(size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - if (linear_tid + (ITEM * ITEMS_PER_THREAD) < GROUP_WORK_ITEMS) { - items[ITEM] = block_itr[linear_tid + (ITEM * ITEMS_PER_THREAD)]; + for (int item = 0; item < ITEMS_PER_THREAD; item++) { + if (linear_tid + (item * ITEMS_PER_THREAD) < GROUP_WORK_ITEMS) { + items[item] = block_itr[linear_tid + (item * ITEMS_PER_THREAD)]; } } } -template +template __dpct_inline__ void load_subgroup_striped(const Item &item, size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { - // int tid = linear_tid & 1; - // int wid = linear_tid >> 5; - // int warp_offset = wid * ITEMS_PER_THREAD; #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - new (&items[ITEM]) + for (int item = 0; item < ITEMS_PER_THREAD; item++) { + new (&items[item]) InputT(block_itr[item.get_sub_group().get_local_range()[0] + - linear_tid + (ITEM * ITEMS_PER_THREAD)]); + linear_tid + (item * ITEMS_PER_THREAD)]); } } From 6f9902654f83bd19ce36697dc552952fae11a4ad Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Fri, 2 Feb 2024 15:17:41 +0530 Subject: [PATCH 13/65] review commit 2 --- .../dpct/dpl_extras/dpcpp_extensions.h | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index d38d3cb7e647..78c4bf1536b6 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -573,36 +573,37 @@ template __dpct_inline__ void load_striped(size_t linear_tid, InputIteratorT block_itr, - InputT (&items)[ITEMS_PER_THREAD]) { + InputT (&items)[ITEMS_PER_WORK_ITEM]) { #pragma unroll - for (int item = 0; item < ITEMS_PER_THREAD; item++) { - if (linear_tid + (item * ITEMS_PER_THREAD) < GROUP_WORK_ITEMS) { - items[item] = block_itr[linear_tid + (item * ITEMS_PER_THREAD)]; + for (signed_int idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { + if (linear_tid + (idx * ITEMS_PER_WORK_ITEM) < GROUP_WORK_ITEMS) { + items[idx] = block_itr[linear_tid + (idx * ITEMS_PER_WORK_ITEM)]; } } } -template __dpct_inline__ void load_subgroup_striped(const Item &item, size_t linear_tid, InputIteratorT block_itr, - InputT (&items)[ITEMS_PER_THREAD]) { + InputT (&items)[ITEMS_PER_WORK_ITEM]) { + int subgroup_offset = item.get_sub_group().get_local_range()[0]; #pragma unroll - for (int item = 0; item < ITEMS_PER_THREAD; item++) { - new (&items[item]) - InputT(block_itr[item.get_sub_group().get_local_range()[0] + - linear_tid + (item * ITEMS_PER_THREAD)]); + for (signed_int idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { + new (&items[idx]) + InputT(block_itr[subgroup_offset + + linear_tid + (idx * ITEMS_PER_WORK_ITEM)]); } } From eb5539aea36be5afa1cea1295cb6a89cafa8b513 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Fri, 2 Feb 2024 23:32:50 +0530 Subject: [PATCH 14/65] fix dtype --- .../dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 78c4bf1536b6..25b491d530d8 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -573,7 +573,7 @@ template Date: Mon, 5 Feb 2024 12:58:54 +0530 Subject: [PATCH 15/65] rm if stmt --- .../dpct/dpl_extras/dpcpp_extensions.h | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 25b491d530d8..68297aebb8f4 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -572,11 +572,11 @@ template __dpct_inline__ void load_blocked(size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { + +uint32_t workgroup_offset = linear_tid * ITEMS_PER_THREAD; #pragma unroll - for (uint32_t idx = 0; idx < ITEMS_PER_THREAD; idx++) { - if ((linear_tid * ITEMS_PER_THREAD) + idx < GROUP_WORK_ITEMS) { - items[idx] = block_itr[(linear_tid * ITEMS_PER_THREAD) + idx]; - } + for (uint32_t idx = 0; workgroup_offset + idx < GROUP_WORK_ITEMS; idx++) { + items[idx] = block_itr[workgroup_offset + idx]; } } @@ -584,11 +584,11 @@ template __dpct_inline__ void load_striped(size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { + + #pragma unroll - for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { - if (linear_tid + (idx * ITEMS_PER_WORK_ITEM) < GROUP_WORK_ITEMS) { - items[idx] = block_itr[linear_tid + (idx * ITEMS_PER_WORK_ITEM)]; - } + for (uint32_t idx = 0, uint32_t workgroup_offset = linear_tid + (idx * ITEMS_PER_WORK_ITEM); workgroup_offset < GROUP_WORK_ITEMS; idx++) { + items[idx] = block_itr[workgroup_offset]; } } @@ -600,10 +600,9 @@ __dpct_inline__ void load_subgroup_striped(const Item &item, size_t linear_tid, size_t subgroup_offset = item.get_sub_group().get_local_range()[0]; #pragma unroll - for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { + for (uint32_t idx = 0, uint32_t workgroup_offset = linear_tid + (idx * ITEMS_PER_WORK_ITEM); idx < ITEMS_PER_WORK_ITEM; idx++) { new (&items[idx]) - InputT(block_itr[subgroup_offset + - linear_tid + (idx * ITEMS_PER_WORK_ITEM)]); + InputT(block_itr[subgroup_offset + workgroup_offset]); } } From e8fc26ee1f3236f3e4918c47dbd6b3c184fc389e Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Mon, 5 Feb 2024 16:48:03 +0530 Subject: [PATCH 16/65] change design , use class --- .../dpct/dpl_extras/dpcpp_extensions.h | 74 +++++++++++-------- 1 file changed, 43 insertions(+), 31 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 68297aebb8f4..7ffed8ca3a66 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -568,44 +568,56 @@ class radix_sort { /// Load linear segment items into block format across threads /// Helper for Block Load -template -__dpct_inline__ void load_blocked(size_t linear_tid, InputIteratorT block_itr, - InputT (&items)[ITEMS_PER_THREAD]) { - -uint32_t workgroup_offset = linear_tid * ITEMS_PER_THREAD; -#pragma unroll - for (uint32_t idx = 0; workgroup_offset + idx < GROUP_WORK_ITEMS; idx++) { - items[idx] = block_itr[workgroup_offset + idx]; - } -} +enum load_algorithm{ -template -__dpct_inline__ void load_striped(size_t linear_tid, InputIteratorT block_itr, - InputT (&items)[ITEMS_PER_WORK_ITEM]) { + subgroup_load; + workgroup_load; +}; -#pragma unroll - for (uint32_t idx = 0, uint32_t workgroup_offset = linear_tid + (idx * ITEMS_PER_WORK_ITEM); workgroup_offset < GROUP_WORK_ITEMS; idx++) { - items[idx] = block_itr[workgroup_offset]; +template +class load { +public: + template + __dpct_inline__ void load_blocked(size_t linear_tid, InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD]) { + + uint32_t workgroup_offset = linear_tid * ITEMS_PER_THREAD; + #pragma unroll + for (uint32_t idx = 0; workgroup_offset + idx < GROUP_WORK_ITEMS; idx++) { + items[idx] = block_itr[workgroup_offset + idx]; + } } -} - -template -__dpct_inline__ void load_subgroup_striped(const Item &item, size_t linear_tid, - InputIteratorT block_itr, - InputT (&items)[ITEMS_PER_WORK_ITEM]) { - - size_t subgroup_offset = item.get_sub_group().get_local_range()[0]; -#pragma unroll - for (uint32_t idx = 0, uint32_t workgroup_offset = linear_tid + (idx * ITEMS_PER_WORK_ITEM); idx < ITEMS_PER_WORK_ITEM; idx++) { - new (&items[idx]) - InputT(block_itr[subgroup_offset + workgroup_offset]); + + template + __dpct_inline__ void load_striped(size_t linear_tid, InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_WORK_ITEM]) { + + #pragma unroll + for (uint32_t idx = 0, uint32_t workgroup_offset = linear_tid + (idx * ITEMS_PER_WORK_ITEM); workgroup_offset < GROUP_WORK_ITEMS; idx++) { + items[idx] = block_itr[workgroup_offset]; + } } + + template + __dpct_inline__ void load_subgroup_striped(const Item &item, size_t linear_tid, + InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_WORK_ITEM]) { + + size_t subgroup_offset = item.get_sub_group().get_local_range()[0]; + #pragma unroll + for (uint32_t idx = 0, uint32_t workgroup_offset = linear_tid + (idx * ITEMS_PER_WORK_ITEM); idx < ITEMS_PER_WORK_ITEM; idx++) { + new (&items[idx]) + InputT(block_itr[subgroup_offset + workgroup_offset]); + } } +private: +}; /// Perform a reduction of the data elements assigned to all threads in the /// group. /// From 0f7b5e4f50b54f4543061bdeac4e730a7265b618 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Mon, 5 Feb 2024 19:29:16 +0530 Subject: [PATCH 17/65] bug fix --- .../include/dpct/dpl_extras/dpcpp_extensions.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 7ffed8ca3a66..c6dd233e4b75 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -570,8 +570,8 @@ class radix_sort { /// Helper for Block Load enum load_algorithm{ - subgroup_load; - workgroup_load; + subgroup_load, + workgroup_load }; @@ -584,9 +584,9 @@ class load { public: template __dpct_inline__ void load_blocked(size_t linear_tid, InputIteratorT block_itr, - InputT (&items)[ITEMS_PER_THREAD]) { + InputT (&items)[ITEMS_PER_WORK_ITEM]) { - uint32_t workgroup_offset = linear_tid * ITEMS_PER_THREAD; + uint32_t workgroup_offset = linear_tid * ITEMS_PER_WORK_ITEM; #pragma unroll for (uint32_t idx = 0; workgroup_offset + idx < GROUP_WORK_ITEMS; idx++) { items[idx] = block_itr[workgroup_offset + idx]; @@ -598,7 +598,7 @@ class load { InputT (&items)[ITEMS_PER_WORK_ITEM]) { #pragma unroll - for (uint32_t idx = 0, uint32_t workgroup_offset = linear_tid + (idx * ITEMS_PER_WORK_ITEM); workgroup_offset < GROUP_WORK_ITEMS; idx++) { + for (uint32_t idx = 0, uint32_t workgroup_offset = (linear_tid + (idx * ITEMS_PER_WORK_ITEM)); idx < GROUP_WORK_ITEMS; idx++) { items[idx] = block_itr[workgroup_offset]; } } @@ -610,7 +610,7 @@ class load { size_t subgroup_offset = item.get_sub_group().get_local_range()[0]; #pragma unroll - for (uint32_t idx = 0, uint32_t workgroup_offset = linear_tid + (idx * ITEMS_PER_WORK_ITEM); idx < ITEMS_PER_WORK_ITEM; idx++) { + for (uint32_t idx = 0, uint32_t workgroup_offset = (linear_tid + (idx * ITEMS_PER_WORK_ITEM)); idx < ITEMS_PER_WORK_ITEM; idx++) { new (&items[idx]) InputT(block_itr[subgroup_offset + workgroup_offset]); } From 93db62addd437b4c8522728036babcb56bc33fc7 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Mon, 5 Feb 2024 21:43:49 +0530 Subject: [PATCH 18/65] remove loop assignment --- .../include/dpct/dpl_extras/dpcpp_extensions.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index c6dd233e4b75..493e864cc7fa 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -582,7 +582,7 @@ template class load { public: - template + template __dpct_inline__ void load_blocked(size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { @@ -593,26 +593,26 @@ class load { } } - template + template __dpct_inline__ void load_striped(size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { #pragma unroll - for (uint32_t idx = 0, uint32_t workgroup_offset = (linear_tid + (idx * ITEMS_PER_WORK_ITEM)); idx < GROUP_WORK_ITEMS; idx++) { - items[idx] = block_itr[workgroup_offset]; + for (uint32_t idx = 0; linear_tid + (idx * ITEMS_PER_WORK_ITEM) < GROUP_WORK_ITEMS; idx++) { + items[idx] = block_itr[linear_tid + (idx * ITEMS_PER_WORK_ITEM)]; } } - template + template __dpct_inline__ void load_subgroup_striped(const Item &item, size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { size_t subgroup_offset = item.get_sub_group().get_local_range()[0]; #pragma unroll - for (uint32_t idx = 0, uint32_t workgroup_offset = (linear_tid + (idx * ITEMS_PER_WORK_ITEM)); idx < ITEMS_PER_WORK_ITEM; idx++) { + for (uint32_t idx = 0; linear_tid + (idx * ITEMS_PER_WORK_ITEM) < ITEMS_PER_WORK_ITEM; idx++) { new (&items[idx]) - InputT(block_itr[subgroup_offset + workgroup_offset]); + InputT(block_itr[subgroup_offset + linear_tid + (idx * ITEMS_PER_WORK_ITEM)]); } } From 2d78e9ae6e9c868a86e498ba502d77b1e26dcf5c Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Tue, 6 Feb 2024 09:51:10 +0530 Subject: [PATCH 19/65] use pseudocode --- .../dpct/dpl_extras/dpcpp_extensions.h | 49 ++++++++++++++----- 1 file changed, 37 insertions(+), 12 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 493e864cc7fa..9749a140bfdd 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -586,20 +586,36 @@ class load { __dpct_inline__ void load_blocked(size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { - uint32_t workgroup_offset = linear_tid * ITEMS_PER_WORK_ITEM; - #pragma unroll - for (uint32_t idx = 0; workgroup_offset + idx < GROUP_WORK_ITEMS; idx++) { - items[idx] = block_itr[workgroup_offset + idx]; + uint32_t workgroup_offset = linear_tid * ITEMS_PER_WORK_ITEM; + if (workgroup_offset > GROUP_WORK_ITEMS){ + uint32_t final_idx = (GROUP_WORK_ITEMS - workgroup_offset); + for (uint32_t idx = 0; idx < final_idx ; idx++) { + items[idx] = block_itr[workgroup_offset + idx]; + } + } + else{ + #pragma unroll + for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { + items[idx] = block_itr[workgroup_offset + idx]; + } } } template __dpct_inline__ void load_striped(size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { - - #pragma unroll - for (uint32_t idx = 0; linear_tid + (idx * ITEMS_PER_WORK_ITEM) < GROUP_WORK_ITEMS; idx++) { - items[idx] = block_itr[linear_tid + (idx * ITEMS_PER_WORK_ITEM)]; + + if (linear_tid + ((ITEMS_PER_WORK_ITEM - 1)*ITEMS_PER_WORK_ITEM) > GROUP_WORK_ITEMS){ + uint32_t final_idx = (GROUP_WORK_ITEMS - linear_tid)/ITEMS_PER_WORK_ITEM; + for (uint32_t idx = 0; idx < final_idx ; idx++) { + items[idx] = block_itr[linear_tid + (idx * ITEMS_PER_WORK_ITEM)]; + } + } + else{ + #pragma unroll + for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { + items[idx] = block_itr[linear_tid + (idx * GROUP_WORK_ITEMS)]; + } } } @@ -609,12 +625,21 @@ class load { InputT (&items)[ITEMS_PER_WORK_ITEM]) { size_t subgroup_offset = item.get_sub_group().get_local_range()[0]; - #pragma unroll - for (uint32_t idx = 0; linear_tid + (idx * ITEMS_PER_WORK_ITEM) < ITEMS_PER_WORK_ITEM; idx++) { - new (&items[idx]) + if (linear_tid + ((ITEMS_PER_WORK_ITEM - 1)*ITEMS_PER_WORK_ITEM) > GROUP_WORK_ITEMS){ + uint32_t final_idx = (GROUP_WORK_ITEMS - linear_tid)/ITEMS_PER_WORK_ITEM; + for (uint32_t idx = 0; idx < final_idx ; idx++) { + new (&items[idx]) InputT(block_itr[subgroup_offset + linear_tid + (idx * ITEMS_PER_WORK_ITEM)]); + } } -} + else{ + #pragma unroll + for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { + new (&items[idx]) + InputT(block_itr[subgroup_offset + linear_tid + (idx * GROUP_WORK_ITEMS)]); + } + } + } private: }; From 1276698345d77215b060aa77dedddea484e860a6 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Tue, 6 Feb 2024 16:01:43 +0530 Subject: [PATCH 20/65] review commit 1 --- .../dpct/dpl_extras/dpcpp_extensions.h | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 9749a140bfdd..201fd4494d3e 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -570,8 +570,8 @@ class radix_sort { /// Helper for Block Load enum load_algorithm{ - subgroup_load, - workgroup_load + blocked, + striped }; @@ -580,7 +580,7 @@ template -class load { +class workgroup_load { public: template __dpct_inline__ void load_blocked(size_t linear_tid, InputIteratorT block_itr, @@ -600,7 +600,17 @@ class load { } } } - + +private: +}; + +template +class subgroup_load { +public: template __dpct_inline__ void load_striped(size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { @@ -618,7 +628,7 @@ class load { } } } - + template __dpct_inline__ void load_subgroup_striped(const Item &item, size_t linear_tid, InputIteratorT block_itr, From 8d43351045fb5b38c49498da0d11fe528549ae5f Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Tue, 6 Feb 2024 21:22:50 +0530 Subject: [PATCH 21/65] add load method --- .../dpct/dpl_extras/dpcpp_extensions.h | 33 ++++++++++++++++--- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 201fd4494d3e..1c39f821d273 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -571,7 +571,8 @@ class radix_sort { enum load_algorithm{ blocked, - striped + striped, + warp_striped }; @@ -582,7 +583,7 @@ template class workgroup_load { public: - template + __dpct_inline__ void load_blocked(size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { @@ -608,10 +609,10 @@ template + typename InputIteratorT, + typename Item> class subgroup_load { public: - template __dpct_inline__ void load_striped(size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { @@ -629,7 +630,7 @@ class subgroup_load { } } - template + __dpct_inline__ void load_subgroup_striped(const Item &item, size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { @@ -653,6 +654,28 @@ class subgroup_load { private: }; + + __dpct_inline__ void load(size_t GROUP_WORK_ITEMS, + size_t ITEMS_PER_WORK_ITEM, + load_algorithm ALGORITHM, + InputT (&items)[ITEMS_PER_WORK_ITEM], + InputIteratorT block_itr, + Item &item){ + + if (ALGORITHM == blocked){ + workgroup_load wg_load; + wg_load.load_blocked(linear_tid, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); + } + else if(ALGORITHM == striped){ + subgroup_load sg_load; + sg_load.load_striped(linear_tid, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); + } + else{ + subgroup_load sg_load; + sg_load.load_subgroup_striped(item, linear_tid, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); + } +} + /// Perform a reduction of the data elements assigned to all threads in the /// group. /// From 3d22cd712d8cf1ac2e2e24531aa56a254f7ded22 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Fri, 9 Feb 2024 13:47:20 +0530 Subject: [PATCH 22/65] refactor logic --- .../dpct/dpl_extras/dpcpp_extensions.h | 66 ++++++++----------- 1 file changed, 27 insertions(+), 39 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 1c39f821d273..db07e7844c10 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -583,26 +583,25 @@ template class workgroup_load { public: + __dpct_inline__ void load(load_algorithm ALGORITHM){ + + if (ALGORITHM == blocked){ + load_blocked(linear_tid, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); + } + } +private: + __dpct_inline__ void load_blocked(size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { uint32_t workgroup_offset = linear_tid * ITEMS_PER_WORK_ITEM; - if (workgroup_offset > GROUP_WORK_ITEMS){ - uint32_t final_idx = (GROUP_WORK_ITEMS - workgroup_offset); - for (uint32_t idx = 0; idx < final_idx ; idx++) { - items[idx] = block_itr[workgroup_offset + idx]; - } - } - else{ - #pragma unroll - for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { - items[idx] = block_itr[workgroup_offset + idx]; - } + #pragma unroll + for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { + items[idx] = block_itr[workgroup_offset + idx]; } } -private: }; template class subgroup_load { public: - __dpct_inline__ void load_striped(size_t linear_tid, InputIteratorT block_itr, + __dpct_inline__ void load(load_algorithm ALGORITHM){ + + if(ALGORITHM == striped){ + load_striped(linear_tid, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); + } + else if(ALGORITHM == warp_striped){ + load_subgroup_striped(item, linear_tid, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); + } +} + +private: + + __dpct_inline__ void load_striped(size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { - if (linear_tid + ((ITEMS_PER_WORK_ITEM - 1)*ITEMS_PER_WORK_ITEM) > GROUP_WORK_ITEMS){ + if (linear_tid + ((ITEMS_PER_WORK_ITEM - 1)*ITEMS_PER_WORK_ITEM) >= GROUP_WORK_ITEMS){ uint32_t final_idx = (GROUP_WORK_ITEMS - linear_tid)/ITEMS_PER_WORK_ITEM; for (uint32_t idx = 0; idx < final_idx ; idx++) { items[idx] = block_itr[linear_tid + (idx * ITEMS_PER_WORK_ITEM)]; @@ -631,12 +642,12 @@ class subgroup_load { } - __dpct_inline__ void load_subgroup_striped(const Item &item, size_t linear_tid, + __dpct_inline__ void load_subgroup_striped(const Item &item, size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { size_t subgroup_offset = item.get_sub_group().get_local_range()[0]; - if (linear_tid + ((ITEMS_PER_WORK_ITEM - 1)*ITEMS_PER_WORK_ITEM) > GROUP_WORK_ITEMS){ + if (linear_tid + ((ITEMS_PER_WORK_ITEM - 1)*ITEMS_PER_WORK_ITEM) >= GROUP_WORK_ITEMS){ uint32_t final_idx = (GROUP_WORK_ITEMS - linear_tid)/ITEMS_PER_WORK_ITEM; for (uint32_t idx = 0; idx < final_idx ; idx++) { new (&items[idx]) @@ -651,31 +662,8 @@ class subgroup_load { } } } - -private: }; - __dpct_inline__ void load(size_t GROUP_WORK_ITEMS, - size_t ITEMS_PER_WORK_ITEM, - load_algorithm ALGORITHM, - InputT (&items)[ITEMS_PER_WORK_ITEM], - InputIteratorT block_itr, - Item &item){ - - if (ALGORITHM == blocked){ - workgroup_load wg_load; - wg_load.load_blocked(linear_tid, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); - } - else if(ALGORITHM == striped){ - subgroup_load sg_load; - sg_load.load_striped(linear_tid, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); - } - else{ - subgroup_load sg_load; - sg_load.load_subgroup_striped(item, linear_tid, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); - } -} - /// Perform a reduction of the data elements assigned to all threads in the /// group. /// From 0b32a440fdfe66e7c3b7d51ffac01b5c3b1fdd5c Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Fri, 9 Feb 2024 15:49:55 +0530 Subject: [PATCH 23/65] fix bug --- .../dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index db07e7844c10..b110151817e6 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -583,7 +583,8 @@ template class workgroup_load { public: - __dpct_inline__ void load(load_algorithm ALGORITHM){ + __dpct_inline__ void load(size_t linear_tid, InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_WORK_ITEM]){ if (ALGORITHM == blocked){ load_blocked(linear_tid, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); @@ -612,7 +613,9 @@ template class subgroup_load { public: - __dpct_inline__ void load(load_algorithm ALGORITHM){ + __dpct_inline__ void load(const Item &item, size_t linear_tid, + InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_WORK_ITEM]){ if(ALGORITHM == striped){ load_striped(linear_tid, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); From e24ebb666b84b6fff4ef5befe5b3c298562d1453 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Wed, 14 Feb 2024 13:27:53 +0530 Subject: [PATCH 24/65] refactor post review --- .../dpct/dpl_extras/dpcpp_extensions.h | 72 ++++++------------- 1 file changed, 22 insertions(+), 50 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index b110151817e6..43871edadacf 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -583,15 +583,24 @@ template class workgroup_load { public: - __dpct_inline__ void load(size_t linear_tid, InputIteratorT block_itr, + + workgroup_load(uint8_t *local_memory) : _local_memory(local_memory) {} + + __dpct_inline__ void load(const Item &item, size_t linear_tid, + InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]){ if (ALGORITHM == blocked){ load_blocked(linear_tid, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); - } - } - -private: + } + else if(ALGORITHM == striped){ + load_striped(linear_tid, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); + } + else if(ALGORITHM == warp_striped){ + load_subgroup_striped(item, linear_tid, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); + } + + } __dpct_inline__ void load_blocked(size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { @@ -602,46 +611,15 @@ class workgroup_load { items[idx] = block_itr[workgroup_offset + idx]; } } - -}; - -template -class subgroup_load { -public: - __dpct_inline__ void load(const Item &item, size_t linear_tid, - InputIteratorT block_itr, - InputT (&items)[ITEMS_PER_WORK_ITEM]){ - - if(ALGORITHM == striped){ - load_striped(linear_tid, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); - } - else if(ALGORITHM == warp_striped){ - load_subgroup_striped(item, linear_tid, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); - } -} - -private: - - __dpct_inline__ void load_striped(size_t linear_tid, InputIteratorT block_itr, + + __dpct_inline__ void load_striped(size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { - if (linear_tid + ((ITEMS_PER_WORK_ITEM - 1)*ITEMS_PER_WORK_ITEM) >= GROUP_WORK_ITEMS){ - uint32_t final_idx = (GROUP_WORK_ITEMS - linear_tid)/ITEMS_PER_WORK_ITEM; - for (uint32_t idx = 0; idx < final_idx ; idx++) { - items[idx] = block_itr[linear_tid + (idx * ITEMS_PER_WORK_ITEM)]; - } - } - else{ #pragma unroll - for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { + for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { items[idx] = block_itr[linear_tid + (idx * GROUP_WORK_ITEMS)]; } - } + } @@ -650,21 +628,15 @@ class subgroup_load { InputT (&items)[ITEMS_PER_WORK_ITEM]) { size_t subgroup_offset = item.get_sub_group().get_local_range()[0]; - if (linear_tid + ((ITEMS_PER_WORK_ITEM - 1)*ITEMS_PER_WORK_ITEM) >= GROUP_WORK_ITEMS){ - uint32_t final_idx = (GROUP_WORK_ITEMS - linear_tid)/ITEMS_PER_WORK_ITEM; - for (uint32_t idx = 0; idx < final_idx ; idx++) { - new (&items[idx]) - InputT(block_itr[subgroup_offset + linear_tid + (idx * ITEMS_PER_WORK_ITEM)]); - } - } - else{ #pragma unroll for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { new (&items[idx]) InputT(block_itr[subgroup_offset + linear_tid + (idx * GROUP_WORK_ITEMS)]); } - } - } + } + +private: + }; /// Perform a reduction of the data elements assigned to all threads in the From 293bf1472396050fbf4708bf0a8ac9cd5d1d3787 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Wed, 14 Feb 2024 13:29:45 +0530 Subject: [PATCH 25/65] compile time branch --- .../dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 43871edadacf..87ad85eb9d94 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -590,13 +590,13 @@ class workgroup_load { InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]){ - if (ALGORITHM == blocked){ + if constexpr (ALGORITHM == blocked){ load_blocked(linear_tid, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); } - else if(ALGORITHM == striped){ + if constexpr (ALGORITHM == striped){ load_striped(linear_tid, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); } - else if(ALGORITHM == warp_striped){ + if constexpr (ALGORITHM == warp_striped){ load_subgroup_striped(item, linear_tid, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); } @@ -623,7 +623,7 @@ class workgroup_load { } - __dpct_inline__ void load_subgroup_striped(const Item &item, size_t linear_tid, + __dpct_inline__ void load_subgroup_striped(const Item &item, size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { From 721e722091bb769a386166c3f31503bab70c3854 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Wed, 14 Feb 2024 14:23:05 +0530 Subject: [PATCH 26/65] update comments --- .../include/dpct/dpl_extras/dpcpp_extensions.h | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 87ad85eb9d94..b5dff88e9fcf 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -601,20 +601,26 @@ class workgroup_load { } } - + //loads a linear segment of workgroup items into a blocked arrangement. __dpct_inline__ void load_blocked(size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { - + + //This implementation does not take in account range loading across workgroup items + //To-do: Decide whether range loading is required for group loading + uint32_t workgroup_offset = linear_tid * ITEMS_PER_WORK_ITEM; #pragma unroll for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { items[idx] = block_itr[workgroup_offset + idx]; } } - + //loads a linear segment of workgroup items into a striped arrangement. __dpct_inline__ void load_striped(size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { + //This implementation does not take in account range loading across workgroup items + //To-do: Decide whether range loading is required for group loading + #pragma unroll for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { items[idx] = block_itr[linear_tid + (idx * GROUP_WORK_ITEMS)]; @@ -622,11 +628,14 @@ class workgroup_load { } - + //loads a linear segment of workgroup items into a subgroup striped arrangement. __dpct_inline__ void load_subgroup_striped(const Item &item, size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { + //This implementation does not take in account range loading across workgroup items + //To-do: Decide whether range loading is required for group loading + size_t subgroup_offset = item.get_sub_group().get_local_range()[0]; #pragma unroll for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { From a1642560f1aff04eb554140cf41f26a66ae9c865 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Wed, 14 Feb 2024 15:49:34 +0530 Subject: [PATCH 27/65] fix bugs --- .../dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index b5dff88e9fcf..93b7f9d3b790 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -580,7 +580,8 @@ template + typename InputIteratorT, + typename Item> class workgroup_load { public: @@ -645,7 +646,8 @@ class workgroup_load { } private: - + +uint8_t *_local_memory; }; /// Perform a reduction of the data elements assigned to all threads in the From 0dc3fa0556b45c9a5c172fd44fdc937ca1727d00 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Wed, 14 Feb 2024 23:29:28 +0530 Subject: [PATCH 28/65] clang-format --- .../dpct/dpl_extras/dpcpp_extensions.h | 99 +++++++++---------- 1 file changed, 49 insertions(+), 50 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 93b7f9d3b790..83a9a18687ef 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -568,7 +568,7 @@ class radix_sort { /// Load linear segment items into block format across threads /// Helper for Block Load -enum load_algorithm{ +enum load_algorithm { blocked, striped, @@ -576,78 +576,77 @@ enum load_algorithm{ }; -template class workgroup_load { public: - workgroup_load(uint8_t *local_memory) : _local_memory(local_memory) {} __dpct_inline__ void load(const Item &item, size_t linear_tid, InputIteratorT block_itr, - InputT (&items)[ITEMS_PER_WORK_ITEM]){ - - if constexpr (ALGORITHM == blocked){ + InputT (&items)[ITEMS_PER_WORK_ITEM]) { + + if constexpr (ALGORITHM == blocked) { load_blocked(linear_tid, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); - } - if constexpr (ALGORITHM == striped){ - load_striped(linear_tid, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); - } - if constexpr (ALGORITHM == warp_striped){ - load_subgroup_striped(item, linear_tid, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); + } + if constexpr (ALGORITHM == striped) { + load_striped(linear_tid, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); + } + if constexpr (ALGORITHM == warp_striped) { + load_subgroup_striped(item, linear_tid, block_itr, + (&items)[ITEMS_PER_WORK_ITEM]); + } } - - } - //loads a linear segment of workgroup items into a blocked arrangement. + // loads a linear segment of workgroup items into a blocked arrangement. __dpct_inline__ void load_blocked(size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { - //This implementation does not take in account range loading across workgroup items - //To-do: Decide whether range loading is required for group loading - - uint32_t workgroup_offset = linear_tid * ITEMS_PER_WORK_ITEM; - #pragma unroll + // This implementation does not take in account range loading across + // workgroup items To-do: Decide whether range loading is required for group + // loading + + uint32_t workgroup_offset = linear_tid * ITEMS_PER_WORK_ITEM; +#pragma unroll for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { items[idx] = block_itr[workgroup_offset + idx]; } } - //loads a linear segment of workgroup items into a striped arrangement. - __dpct_inline__ void load_striped(size_t linear_tid, InputIteratorT block_itr, + // loads a linear segment of workgroup items into a striped arrangement. + __dpct_inline__ void load_striped(size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { - //This implementation does not take in account range loading across workgroup items - //To-do: Decide whether range loading is required for group loading - - #pragma unroll - for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { - items[idx] = block_itr[linear_tid + (idx * GROUP_WORK_ITEMS)]; - } - + // This implementation does not take in account range loading across + // workgroup items To-do: Decide whether range loading is required for group + // loading + +#pragma unroll + for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { + items[idx] = block_itr[linear_tid + (idx * GROUP_WORK_ITEMS)]; + } } - //loads a linear segment of workgroup items into a subgroup striped arrangement. - __dpct_inline__ void load_subgroup_striped(const Item &item, size_t linear_tid, - InputIteratorT block_itr, - InputT (&items)[ITEMS_PER_WORK_ITEM]) { - - //This implementation does not take in account range loading across workgroup items - //To-do: Decide whether range loading is required for group loading - + // loads a linear segment of workgroup items into a subgroup striped + // arrangement. + __dpct_inline__ void + load_subgroup_striped(const Item &item, size_t linear_tid, + InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_WORK_ITEM]) { + + // This implementation does not take in account range loading across + // workgroup items To-do: Decide whether range loading is required for group + // loading + size_t subgroup_offset = item.get_sub_group().get_local_range()[0]; - #pragma unroll - for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { - new (&items[idx]) - InputT(block_itr[subgroup_offset + linear_tid + (idx * GROUP_WORK_ITEMS)]); - } - } +#pragma unroll + for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { + new (&items[idx]) InputT( + block_itr[subgroup_offset + linear_tid + (idx * GROUP_WORK_ITEMS)]); + } + } private: - -uint8_t *_local_memory; + uint8_t *_local_memory; }; /// Perform a reduction of the data elements assigned to all threads in the From 59b881e64151ee940d044c7d5b45c88e433881d2 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Mon, 26 Feb 2024 11:59:27 +0530 Subject: [PATCH 29/65] review commits --- .../dpct/dpl_extras/dpcpp_extensions.h | 57 ++++++++++--------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 83a9a18687ef..889ccc9296ba 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -570,12 +570,36 @@ class radix_sort { /// Helper for Block Load enum load_algorithm { - blocked, - striped, - warp_striped + BLOCK_LOAD_DIRECT, + BLOCK_LOAD_STRIPED, + // To-do: BLOCK_LOAD_WARP_TRANSPOSE }; +// loads a linear segment of workgroup items into a subgroup striped +// arrangement. Created as free function until exchange mechanism is +// implemented. +// To-do: inline this function with BLOCK_LOAD_WARP_TRANSPOSE mechanism +template +__dpct_inline__ void +load_subgroup_striped(const Item &item, size_t linear_tid, + InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_WORK_ITEM]) { + + // This implementation does not take in account range loading across + // workgroup items To-do: Decide whether range loading is required for group + // loading + + size_t subgroup_offset = item.get_sub_group().get_local_range()[0]; +#pragma unroll + for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { + new (&items[idx]) InputT( + block_itr[subgroup_offset + linear_tid + (idx * GROUP_WORK_ITEMS)]); + } +} + template @@ -587,16 +611,12 @@ class workgroup_load { InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { - if constexpr (ALGORITHM == blocked) { + if constexpr (ALGORITHM == BLOCK_LOAD_DIRECT) { load_blocked(linear_tid, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); } - if constexpr (ALGORITHM == striped) { + if constexpr (ALGORITHM == BLOCK_LOAD_STRIPED) { load_striped(linear_tid, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); } - if constexpr (ALGORITHM == warp_striped) { - load_subgroup_striped(item, linear_tid, block_itr, - (&items)[ITEMS_PER_WORK_ITEM]); - } } // loads a linear segment of workgroup items into a blocked arrangement. __dpct_inline__ void load_blocked(size_t linear_tid, InputIteratorT block_itr, @@ -626,25 +646,6 @@ class workgroup_load { } } - // loads a linear segment of workgroup items into a subgroup striped - // arrangement. - __dpct_inline__ void - load_subgroup_striped(const Item &item, size_t linear_tid, - InputIteratorT block_itr, - InputT (&items)[ITEMS_PER_WORK_ITEM]) { - - // This implementation does not take in account range loading across - // workgroup items To-do: Decide whether range loading is required for group - // loading - - size_t subgroup_offset = item.get_sub_group().get_local_range()[0]; -#pragma unroll - for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { - new (&items[idx]) InputT( - block_itr[subgroup_offset + linear_tid + (idx * GROUP_WORK_ITEMS)]); - } - } - private: uint8_t *_local_memory; }; From b6f123c7c3ce4edf32811ec6d1d8c234eda2b103 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Mon, 26 Feb 2024 12:59:17 +0530 Subject: [PATCH 30/65] fix format --- .../dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 889ccc9296ba..3f4f0096b992 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -577,12 +577,11 @@ enum load_algorithm { }; // loads a linear segment of workgroup items into a subgroup striped -// arrangement. Created as free function until exchange mechanism is +// arrangement. Created as free function until exchange mechanism is // implemented. // To-do: inline this function with BLOCK_LOAD_WARP_TRANSPOSE mechanism -template +template __dpct_inline__ void load_subgroup_striped(const Item &item, size_t linear_tid, InputIteratorT block_itr, From 5436755b88e115846ac7a9c593eb424a4f9174d8 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Wed, 28 Feb 2024 23:51:06 -0800 Subject: [PATCH 31/65] review commit --- .../dpct/dpl_extras/dpcpp_extensions.h | 67 +++++++++++-------- 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 3f4f0096b992..0af4a5cd5091 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -576,11 +576,41 @@ enum load_algorithm { }; +// loads a linear segment of workgroup items into a blocked arrangement. +__dpct_inline__ void load_blocked(size_t linear_tid, InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_WORK_ITEM]) { + + // This implementation does not take in account range loading across + // workgroup items To-do: Decide whether range loading is required for group + // loading + + uint32_t workgroup_offset = linear_tid * ITEMS_PER_WORK_ITEM; +#pragma unroll + for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { + items[idx] = block_itr[workgroup_offset + idx]; + } + } + +// loads a linear segment of workgroup items into a striped arrangement. +__dpct_inline__ void load_striped(size_t linear_tid, InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_WORK_ITEM]) { + + // This implementation does not take in account range loading across + // workgroup items To-do: Decide whether range loading is required for group + // loading + +#pragma unroll + for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { + items[idx] = block_itr[linear_tid + (idx * GROUP_WORK_ITEMS)]; + } + } + + // loads a linear segment of workgroup items into a subgroup striped // arrangement. Created as free function until exchange mechanism is // implemented. // To-do: inline this function with BLOCK_LOAD_WARP_TRANSPOSE mechanism -template __dpct_inline__ void load_subgroup_striped(const Item &item, size_t linear_tid, @@ -592,13 +622,18 @@ load_subgroup_striped(const Item &item, size_t linear_tid, // loading size_t subgroup_offset = item.get_sub_group().get_local_range()[0]; + size_t subgroup_size = item.get_sub_group().get_local_linear_range(); + size_t subgroup_idx = item.get_sub_group().get_global_range(); + size_t inital_offset = (subgroup_id * ITEMS_PER_WORK_ITEM * subgroup_size) + subgroup_offset; #pragma unroll for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { new (&items[idx]) InputT( - block_itr[subgroup_offset + linear_tid + (idx * GROUP_WORK_ITEMS)]); + block_itr[initial_offset + (idx * subgroup_size)]); } } + + template @@ -617,33 +652,7 @@ class workgroup_load { load_striped(linear_tid, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); } } - // loads a linear segment of workgroup items into a blocked arrangement. - __dpct_inline__ void load_blocked(size_t linear_tid, InputIteratorT block_itr, - InputT (&items)[ITEMS_PER_WORK_ITEM]) { - - // This implementation does not take in account range loading across - // workgroup items To-do: Decide whether range loading is required for group - // loading - - uint32_t workgroup_offset = linear_tid * ITEMS_PER_WORK_ITEM; -#pragma unroll - for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { - items[idx] = block_itr[workgroup_offset + idx]; - } - } - // loads a linear segment of workgroup items into a striped arrangement. - __dpct_inline__ void load_striped(size_t linear_tid, InputIteratorT block_itr, - InputT (&items)[ITEMS_PER_WORK_ITEM]) { - - // This implementation does not take in account range loading across - // workgroup items To-do: Decide whether range loading is required for group - // loading - -#pragma unroll - for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { - items[idx] = block_itr[linear_tid + (idx * GROUP_WORK_ITEMS)]; - } - } + private: uint8_t *_local_memory; From 118bcc1644f27bea301d41c9d6cd1d82e4b56bb3 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Thu, 29 Feb 2024 01:03:02 -0800 Subject: [PATCH 32/65] fix bugs --- .../dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 0af4a5cd5091..5304f7fdc21d 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -577,6 +577,8 @@ enum load_algorithm { }; // loads a linear segment of workgroup items into a blocked arrangement. +template __dpct_inline__ void load_blocked(size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { @@ -592,6 +594,8 @@ __dpct_inline__ void load_blocked(size_t linear_tid, InputIteratorT block_itr, } // loads a linear segment of workgroup items into a striped arrangement. +template __dpct_inline__ void load_striped(size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { @@ -624,7 +628,7 @@ load_subgroup_striped(const Item &item, size_t linear_tid, size_t subgroup_offset = item.get_sub_group().get_local_range()[0]; size_t subgroup_size = item.get_sub_group().get_local_linear_range(); size_t subgroup_idx = item.get_sub_group().get_global_range(); - size_t inital_offset = (subgroup_id * ITEMS_PER_WORK_ITEM * subgroup_size) + subgroup_offset; + size_t initial_offset = (subgroup_idx * ITEMS_PER_WORK_ITEM * subgroup_size) + subgroup_offset; #pragma unroll for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { new (&items[idx]) InputT( From 70608212103959bcf01dfad4095fb62df7012d63 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Fri, 1 Mar 2024 21:52:54 +0530 Subject: [PATCH 33/65] review commits --- .../dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 5304f7fdc21d..a70ea30c40e1 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -624,7 +624,8 @@ load_subgroup_striped(const Item &item, size_t linear_tid, // This implementation does not take in account range loading across // workgroup items To-do: Decide whether range loading is required for group // loading - + // This implementation uses unintialized memory for loading linear segments + // into warp striped arrangement. size_t subgroup_offset = item.get_sub_group().get_local_range()[0]; size_t subgroup_size = item.get_sub_group().get_local_linear_range(); size_t subgroup_idx = item.get_sub_group().get_global_range(); @@ -652,7 +653,7 @@ class workgroup_load { if constexpr (ALGORITHM == BLOCK_LOAD_DIRECT) { load_blocked(linear_tid, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); } - if constexpr (ALGORITHM == BLOCK_LOAD_STRIPED) { + else if constexpr (ALGORITHM == BLOCK_LOAD_STRIPED) { load_striped(linear_tid, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); } } From 48677b93adc330431eaa4134410a1e31bbf25e25 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Mon, 4 Mar 2024 17:55:49 +0530 Subject: [PATCH 34/65] format --- .../dpct/dpl_extras/dpcpp_extensions.h | 57 +++++++++---------- 1 file changed, 26 insertions(+), 31 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index a70ea30c40e1..d028647ec794 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -577,45 +577,44 @@ enum load_algorithm { }; // loads a linear segment of workgroup items into a blocked arrangement. -template +template __dpct_inline__ void load_blocked(size_t linear_tid, InputIteratorT block_itr, - InputT (&items)[ITEMS_PER_WORK_ITEM]) { + InputT (&items)[ITEMS_PER_WORK_ITEM]) { - // This implementation does not take in account range loading across - // workgroup items To-do: Decide whether range loading is required for group - // loading + // This implementation does not take in account range loading across + // workgroup items To-do: Decide whether range loading is required for group + // loading - uint32_t workgroup_offset = linear_tid * ITEMS_PER_WORK_ITEM; + uint32_t workgroup_offset = linear_tid * ITEMS_PER_WORK_ITEM; #pragma unroll - for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { - items[idx] = block_itr[workgroup_offset + idx]; - } + for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { + items[idx] = block_itr[workgroup_offset + idx]; } - +} + // loads a linear segment of workgroup items into a striped arrangement. -template +template __dpct_inline__ void load_striped(size_t linear_tid, InputIteratorT block_itr, - InputT (&items)[ITEMS_PER_WORK_ITEM]) { + InputT (&items)[ITEMS_PER_WORK_ITEM]) { - // This implementation does not take in account range loading across - // workgroup items To-do: Decide whether range loading is required for group - // loading + // This implementation does not take in account range loading across + // workgroup items To-do: Decide whether range loading is required for group + // loading #pragma unroll - for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { - items[idx] = block_itr[linear_tid + (idx * GROUP_WORK_ITEMS)]; - } + for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { + items[idx] = block_itr[linear_tid + (idx * GROUP_WORK_ITEMS)]; } - +} // loads a linear segment of workgroup items into a subgroup striped // arrangement. Created as free function until exchange mechanism is // implemented. // To-do: inline this function with BLOCK_LOAD_WARP_TRANSPOSE mechanism -template +template __dpct_inline__ void load_subgroup_striped(const Item &item, size_t linear_tid, InputIteratorT block_itr, @@ -629,16 +628,14 @@ load_subgroup_striped(const Item &item, size_t linear_tid, size_t subgroup_offset = item.get_sub_group().get_local_range()[0]; size_t subgroup_size = item.get_sub_group().get_local_linear_range(); size_t subgroup_idx = item.get_sub_group().get_global_range(); - size_t initial_offset = (subgroup_idx * ITEMS_PER_WORK_ITEM * subgroup_size) + subgroup_offset; + size_t initial_offset = + (subgroup_idx * ITEMS_PER_WORK_ITEM * subgroup_size) + subgroup_offset; #pragma unroll for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { - new (&items[idx]) InputT( - block_itr[initial_offset + (idx * subgroup_size)]); + new (&items[idx]) InputT(block_itr[initial_offset + (idx * subgroup_size)]); } } - - template @@ -652,12 +649,10 @@ class workgroup_load { if constexpr (ALGORITHM == BLOCK_LOAD_DIRECT) { load_blocked(linear_tid, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); - } - else if constexpr (ALGORITHM == BLOCK_LOAD_STRIPED) { + } else if constexpr (ALGORITHM == BLOCK_LOAD_STRIPED) { load_striped(linear_tid, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); } } - private: uint8_t *_local_memory; From 7e503273b842e73fbf6e5357361a5abf923a12dd Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Mon, 11 Mar 2024 13:29:47 +0530 Subject: [PATCH 35/65] remove redundant template arg --- .../include/dpct/dpl_extras/dpcpp_extensions.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index d028647ec794..792fe3ef57fa 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -577,8 +577,8 @@ enum load_algorithm { }; // loads a linear segment of workgroup items into a blocked arrangement. -template +template __dpct_inline__ void load_blocked(size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { @@ -594,15 +594,15 @@ __dpct_inline__ void load_blocked(size_t linear_tid, InputIteratorT block_itr, } // loads a linear segment of workgroup items into a striped arrangement. -template +template __dpct_inline__ void load_striped(size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { // This implementation does not take in account range loading across // workgroup items To-do: Decide whether range loading is required for group // loading - + size_t GROUP_WORK_ITEMS = item.get_global_range(); #pragma unroll for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { items[idx] = block_itr[linear_tid + (idx * GROUP_WORK_ITEMS)]; @@ -636,8 +636,8 @@ load_subgroup_striped(const Item &item, size_t linear_tid, } } -template class workgroup_load { public: From 73aab254b7e8d031dbedf4799e31475da0e779e1 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Mon, 11 Mar 2024 13:30:47 +0530 Subject: [PATCH 36/65] use auto cast --- .../dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 792fe3ef57fa..e60511cb055f 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -625,10 +625,10 @@ load_subgroup_striped(const Item &item, size_t linear_tid, // loading // This implementation uses unintialized memory for loading linear segments // into warp striped arrangement. - size_t subgroup_offset = item.get_sub_group().get_local_range()[0]; - size_t subgroup_size = item.get_sub_group().get_local_linear_range(); - size_t subgroup_idx = item.get_sub_group().get_global_range(); - size_t initial_offset = + auto subgroup_offset = item.get_sub_group().get_local_range()[0]; + auto subgroup_size = item.get_sub_group().get_local_linear_range(); + auto subgroup_idx = item.get_sub_group().get_global_range(); + auto initial_offset = (subgroup_idx * ITEMS_PER_WORK_ITEM * subgroup_size) + subgroup_offset; #pragma unroll for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { From f97e6657200995b899962bafad85ea3d932f5b34 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Mon, 11 Mar 2024 16:11:16 +0530 Subject: [PATCH 37/65] format --- .../include/dpct/dpl_extras/dpcpp_extensions.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index e60511cb055f..4407b71a0abf 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -577,8 +577,8 @@ enum load_algorithm { }; // loads a linear segment of workgroup items into a blocked arrangement. -template +template __dpct_inline__ void load_blocked(size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { @@ -594,9 +594,10 @@ __dpct_inline__ void load_blocked(size_t linear_tid, InputIteratorT block_itr, } // loads a linear segment of workgroup items into a striped arrangement. -template -__dpct_inline__ void load_striped(size_t linear_tid, InputIteratorT block_itr, +template +__dpct_inline__ void load_striped(const Item &item, size_t linear_tid, + InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { // This implementation does not take in account range loading across @@ -636,9 +637,8 @@ load_subgroup_striped(const Item &item, size_t linear_tid, } } -template +template class workgroup_load { public: workgroup_load(uint8_t *local_memory) : _local_memory(local_memory) {} @@ -650,7 +650,7 @@ class workgroup_load { if constexpr (ALGORITHM == BLOCK_LOAD_DIRECT) { load_blocked(linear_tid, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); } else if constexpr (ALGORITHM == BLOCK_LOAD_STRIPED) { - load_striped(linear_tid, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); + load_striped(item, linear_tid, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); } } From b90f7d992091c31d6333d15f521a8a376f907c1c Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Mon, 11 Mar 2024 16:27:01 +0530 Subject: [PATCH 38/65] rename function --- .../runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 4407b71a0abf..6f61a0bcf9ec 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -617,7 +617,7 @@ __dpct_inline__ void load_striped(const Item &item, size_t linear_tid, template __dpct_inline__ void -load_subgroup_striped(const Item &item, size_t linear_tid, +uninitialized_load_subgroup_striped(const Item &item, size_t linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { From fc0ce874ed6d8f880bd2676426efe44c602cf417 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Mon, 11 Mar 2024 16:35:39 +0530 Subject: [PATCH 39/65] format --- .../dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 6f61a0bcf9ec..6f4b8c648d04 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -618,8 +618,8 @@ template __dpct_inline__ void uninitialized_load_subgroup_striped(const Item &item, size_t linear_tid, - InputIteratorT block_itr, - InputT (&items)[ITEMS_PER_WORK_ITEM]) { + InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_WORK_ITEM]) { // This implementation does not take in account range loading across // workgroup items To-do: Decide whether range loading is required for group From d4ce0b1deec8af100296631c6720b68eef7fee87 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Mon, 18 Mar 2024 18:29:49 +0530 Subject: [PATCH 40/65] use uint32_t in place of auto --- .../dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 6f4b8c648d04..7f21c4fd9fd7 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -626,10 +626,10 @@ uninitialized_load_subgroup_striped(const Item &item, size_t linear_tid, // loading // This implementation uses unintialized memory for loading linear segments // into warp striped arrangement. - auto subgroup_offset = item.get_sub_group().get_local_range()[0]; - auto subgroup_size = item.get_sub_group().get_local_linear_range(); - auto subgroup_idx = item.get_sub_group().get_global_range(); - auto initial_offset = + uint32_t subgroup_offset = item.get_sub_group().get_local_range()[0]; + uint32_t subgroup_size = item.get_sub_group().get_local_linear_range(); + uint32_t subgroup_idx = item.get_sub_group().get_global_range(); + uint32_t initial_offset = (subgroup_idx * ITEMS_PER_WORK_ITEM * subgroup_size) + subgroup_offset; #pragma unroll for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { From 29d4405e0d6bc6a6a1445f180566ace0e9daacec Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Mon, 18 Mar 2024 18:35:35 +0530 Subject: [PATCH 41/65] use item to get linear id --- .../include/dpct/dpl_extras/dpcpp_extensions.h | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 7f21c4fd9fd7..104ab926c830 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -579,13 +579,13 @@ enum load_algorithm { // loads a linear segment of workgroup items into a blocked arrangement. template -__dpct_inline__ void load_blocked(size_t linear_tid, InputIteratorT block_itr, +__dpct_inline__ void load_blocked(const Item &item, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { // This implementation does not take in account range loading across // workgroup items To-do: Decide whether range loading is required for group // loading - + size_t linear_tid = item.get_local_linear_id(); uint32_t workgroup_offset = linear_tid * ITEMS_PER_WORK_ITEM; #pragma unroll for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { @@ -596,13 +596,13 @@ __dpct_inline__ void load_blocked(size_t linear_tid, InputIteratorT block_itr, // loads a linear segment of workgroup items into a striped arrangement. template -__dpct_inline__ void load_striped(const Item &item, size_t linear_tid, - InputIteratorT block_itr, +__dpct_inline__ void load_striped(const Item &item, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { // This implementation does not take in account range loading across // workgroup items To-do: Decide whether range loading is required for group // loading + size_t linear_tid = item.get_local_linear_id(); size_t GROUP_WORK_ITEMS = item.get_global_range(); #pragma unroll for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { @@ -617,8 +617,7 @@ __dpct_inline__ void load_striped(const Item &item, size_t linear_tid, template __dpct_inline__ void -uninitialized_load_subgroup_striped(const Item &item, size_t linear_tid, - InputIteratorT block_itr, +uninitialized_load_subgroup_striped(const Item &item, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { // This implementation does not take in account range loading across @@ -643,14 +642,13 @@ class workgroup_load { public: workgroup_load(uint8_t *local_memory) : _local_memory(local_memory) {} - __dpct_inline__ void load(const Item &item, size_t linear_tid, - InputIteratorT block_itr, + __dpct_inline__ void load(const Item &item, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { if constexpr (ALGORITHM == BLOCK_LOAD_DIRECT) { - load_blocked(linear_tid, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); + load_blocked(item, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); } else if constexpr (ALGORITHM == BLOCK_LOAD_STRIPED) { - load_striped(item, linear_tid, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); + load_striped(item, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); } } From a6a85ff53ef811d409c5eb47b25922050ef28207 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Tue, 19 Mar 2024 22:09:55 -0700 Subject: [PATCH 42/65] fix bug --- .../runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 104ab926c830..833e0a195652 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -578,7 +578,7 @@ enum load_algorithm { // loads a linear segment of workgroup items into a blocked arrangement. template + typename InputIteratorT, typename Item> __dpct_inline__ void load_blocked(const Item &item, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { From 6ffd681f8c4d84f4a2fbfe66463d06218696f5ae Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Thu, 21 Mar 2024 13:52:45 +0530 Subject: [PATCH 43/65] add tempstorage for load --- .../dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 833e0a195652..1168d1ab4b59 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -639,7 +639,13 @@ uninitialized_load_subgroup_striped(const Item &item, InputIteratorT block_itr, template class workgroup_load { -public: + static size_t get_local_memory_size(size_t group_threads) { + size_t ranks_size = + detail::radix_rank::get_local_memory_size(group_threads); + size_t exchange_size = + exchange::get_local_memory_size(group_threads); + return sycl::max(ranks_size, exchange_size); + } workgroup_load(uint8_t *local_memory) : _local_memory(local_memory) {} __dpct_inline__ void load(const Item &item, InputIteratorT block_itr, From ee459912f8131b2f449582f70edff3caa06ec491 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Thu, 21 Mar 2024 15:54:10 +0530 Subject: [PATCH 44/65] fix bug --- .../dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 1168d1ab4b59..4e31c500f088 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -637,14 +637,12 @@ uninitialized_load_subgroup_striped(const Item &item, InputIteratorT block_itr, } template + typename InputIteratorT, typename Item, typename T> class workgroup_load { static size_t get_local_memory_size(size_t group_threads) { - size_t ranks_size = - detail::radix_rank::get_local_memory_size(group_threads); - size_t exchange_size = + size_t group_size = exchange::get_local_memory_size(group_threads); - return sycl::max(ranks_size, exchange_size); + return group_size; } workgroup_load(uint8_t *local_memory) : _local_memory(local_memory) {} From 7c8111d9f6231201d7ebcbf0d33bb03d1fc6b85d Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Thu, 21 Mar 2024 17:21:39 +0530 Subject: [PATCH 45/65] simplify logic --- .../dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 4e31c500f088..d9e3d0eaf04c 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -640,9 +640,7 @@ template class workgroup_load { static size_t get_local_memory_size(size_t group_threads) { - size_t group_size = - exchange::get_local_memory_size(group_threads); - return group_size; + return (group_threads * ITEMS_PER_WORK_ITEM) * sizeof(T); } workgroup_load(uint8_t *local_memory) : _local_memory(local_memory) {} From a406b153728b4173b2356b77ac06ed81f3c42052 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Fri, 22 Mar 2024 00:02:01 +0530 Subject: [PATCH 46/65] fix id selection methods --- .../dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index d9e3d0eaf04c..3a5d0f04f51c 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -625,9 +625,9 @@ uninitialized_load_subgroup_striped(const Item &item, InputIteratorT block_itr, // loading // This implementation uses unintialized memory for loading linear segments // into warp striped arrangement. - uint32_t subgroup_offset = item.get_sub_group().get_local_range()[0]; + uint32_t subgroup_offset = item.get_sub_group().get_local_linear_id(); uint32_t subgroup_size = item.get_sub_group().get_local_linear_range(); - uint32_t subgroup_idx = item.get_sub_group().get_global_range(); + uint32_t subgroup_idx = item.get_sub_group().get_group_linear_id(); uint32_t initial_offset = (subgroup_idx * ITEMS_PER_WORK_ITEM * subgroup_size) + subgroup_offset; #pragma unroll From c3bc9429b911064a1a026866a70c99fa749d13f6 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Mon, 25 Mar 2024 08:34:47 +0530 Subject: [PATCH 47/65] rm local_memory unused --- .../dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 3a5d0f04f51c..d39cac40a6ac 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -642,8 +642,7 @@ class workgroup_load { static size_t get_local_memory_size(size_t group_threads) { return (group_threads * ITEMS_PER_WORK_ITEM) * sizeof(T); } - workgroup_load(uint8_t *local_memory) : _local_memory(local_memory) {} - + __dpct_inline__ void load(const Item &item, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { @@ -653,9 +652,6 @@ class workgroup_load { load_striped(item, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); } } - -private: - uint8_t *_local_memory; }; /// Perform a reduction of the data elements assigned to all threads in the From 69dbddcc79fcd25667b11edfc85928ba116f16cc Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Mon, 25 Mar 2024 09:22:56 +0530 Subject: [PATCH 48/65] clang format --- clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 1 - 1 file changed, 1 deletion(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index d39cac40a6ac..01182252669e 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -642,7 +642,6 @@ class workgroup_load { static size_t get_local_memory_size(size_t group_threads) { return (group_threads * ITEMS_PER_WORK_ITEM) * sizeof(T); } - __dpct_inline__ void load(const Item &item, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { From 49f5d85ea944f940d90d9863e0e089a9267e40ea Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Wed, 27 Mar 2024 11:01:59 +0530 Subject: [PATCH 49/65] update based on discussion --- .../dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 01182252669e..9a47275a2551 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -639,9 +639,11 @@ uninitialized_load_subgroup_striped(const Item &item, InputIteratorT block_itr, template class workgroup_load { - static size_t get_local_memory_size(size_t group_threads) { - return (group_threads * ITEMS_PER_WORK_ITEM) * sizeof(T); + static size_t get_local_memory_size(size_t group_work_items) { + return 0; } + workgroup_load(uint8_t *local_memory) : _local_memory(local_memory) {} + __dpct_inline__ void load(const Item &item, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { @@ -651,6 +653,9 @@ class workgroup_load { load_striped(item, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); } } + +private: + uint8_t *_local_memory; }; /// Perform a reduction of the data elements assigned to all threads in the From b1d8d706b8c4f86a201324b87901e02c83287e58 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Wed, 27 Mar 2024 11:10:59 +0530 Subject: [PATCH 50/65] format --- .../dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 9a47275a2551..fb30c39a808b 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -639,9 +639,7 @@ uninitialized_load_subgroup_striped(const Item &item, InputIteratorT block_itr, template class workgroup_load { - static size_t get_local_memory_size(size_t group_work_items) { - return 0; - } + static size_t get_local_memory_size(size_t group_work_items) { return 0; } workgroup_load(uint8_t *local_memory) : _local_memory(local_memory) {} __dpct_inline__ void load(const Item &item, InputIteratorT block_itr, From 7cefbf8bd3d3c91d04d0bb058a8d59127928f815 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Thu, 28 Mar 2024 11:33:56 +0530 Subject: [PATCH 51/65] update variable case --- .../dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index fb30c39a808b..a42030ad8ee6 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -603,10 +603,10 @@ __dpct_inline__ void load_striped(const Item &item, InputIteratorT block_itr, // workgroup items To-do: Decide whether range loading is required for group // loading size_t linear_tid = item.get_local_linear_id(); - size_t GROUP_WORK_ITEMS = item.get_global_range(); + size_t group_work_items = item.get_global_range(); #pragma unroll for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { - items[idx] = block_itr[linear_tid + (idx * GROUP_WORK_ITEMS)]; + items[idx] = block_itr[linear_tid + (idx * group_work_items)]; } } From fdc2f2fbb9e59370d7607c0048c4b482fabcbd4f Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Wed, 10 Apr 2024 16:33:36 +0530 Subject: [PATCH 52/65] use size_t --- .../dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index a42030ad8ee6..f8fa23cc0cec 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -588,7 +588,7 @@ __dpct_inline__ void load_blocked(const Item &item, InputIteratorT block_itr, size_t linear_tid = item.get_local_linear_id(); uint32_t workgroup_offset = linear_tid * ITEMS_PER_WORK_ITEM; #pragma unroll - for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { + for (size_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { items[idx] = block_itr[workgroup_offset + idx]; } } @@ -605,7 +605,7 @@ __dpct_inline__ void load_striped(const Item &item, InputIteratorT block_itr, size_t linear_tid = item.get_local_linear_id(); size_t group_work_items = item.get_global_range(); #pragma unroll - for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { + for (size_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { items[idx] = block_itr[linear_tid + (idx * group_work_items)]; } } @@ -631,7 +631,7 @@ uninitialized_load_subgroup_striped(const Item &item, InputIteratorT block_itr, uint32_t initial_offset = (subgroup_idx * ITEMS_PER_WORK_ITEM * subgroup_size) + subgroup_offset; #pragma unroll - for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { + for (size_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { new (&items[idx]) InputT(block_itr[initial_offset + (idx * subgroup_size)]); } } From 95db67f52cb69b6a23939d8377035a23a0dd2fd0 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Tue, 30 Apr 2024 17:50:59 +0530 Subject: [PATCH 53/65] fix issues related to tests 619 --- .../dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index f8fa23cc0cec..dac29d19f4ae 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -603,7 +603,7 @@ __dpct_inline__ void load_striped(const Item &item, InputIteratorT block_itr, // workgroup items To-do: Decide whether range loading is required for group // loading size_t linear_tid = item.get_local_linear_id(); - size_t group_work_items = item.get_global_range(); + size_t group_work_items = item.get_global_range().size(); #pragma unroll for (size_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { items[idx] = block_itr[linear_tid + (idx * group_work_items)]; @@ -637,8 +637,9 @@ uninitialized_load_subgroup_striped(const Item &item, InputIteratorT block_itr, } template + typename InputIteratorT, typename Item> class workgroup_load { +public: static size_t get_local_memory_size(size_t group_work_items) { return 0; } workgroup_load(uint8_t *local_memory) : _local_memory(local_memory) {} @@ -646,9 +647,9 @@ class workgroup_load { InputT (&items)[ITEMS_PER_WORK_ITEM]) { if constexpr (ALGORITHM == BLOCK_LOAD_DIRECT) { - load_blocked(item, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); + load_blocked(item, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); } else if constexpr (ALGORITHM == BLOCK_LOAD_STRIPED) { - load_striped(item, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); + load_striped(item, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); } } From 141ace7ba75388f55f256dfcd67c8a83b51a845e Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Tue, 30 Apr 2024 18:39:24 +0530 Subject: [PATCH 54/65] remove ALGORITHM parameter --- .../dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index dac29d19f4ae..6bf3ddb00d60 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -577,7 +577,7 @@ enum load_algorithm { }; // loads a linear segment of workgroup items into a blocked arrangement. -template __dpct_inline__ void load_blocked(const Item &item, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { @@ -594,7 +594,7 @@ __dpct_inline__ void load_blocked(const Item &item, InputIteratorT block_itr, } // loads a linear segment of workgroup items into a striped arrangement. -template __dpct_inline__ void load_striped(const Item &item, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { @@ -647,9 +647,9 @@ class workgroup_load { InputT (&items)[ITEMS_PER_WORK_ITEM]) { if constexpr (ALGORITHM == BLOCK_LOAD_DIRECT) { - load_blocked(item, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); + load_blocked(item, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); } else if constexpr (ALGORITHM == BLOCK_LOAD_STRIPED) { - load_striped(item, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); + load_striped(item, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); } } From 540db2903df9d81a8f76533cb4e9a3e846c4d588 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Wed, 1 May 2024 13:50:05 +0530 Subject: [PATCH 55/65] format fix --- .../include/dpct/dpl_extras/dpcpp_extensions.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 6bf3ddb00d60..2202510786ab 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -577,8 +577,8 @@ enum load_algorithm { }; // loads a linear segment of workgroup items into a blocked arrangement. -template +template __dpct_inline__ void load_blocked(const Item &item, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { @@ -594,8 +594,8 @@ __dpct_inline__ void load_blocked(const Item &item, InputIteratorT block_itr, } // loads a linear segment of workgroup items into a striped arrangement. -template +template __dpct_inline__ void load_striped(const Item &item, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { @@ -647,9 +647,11 @@ class workgroup_load { InputT (&items)[ITEMS_PER_WORK_ITEM]) { if constexpr (ALGORITHM == BLOCK_LOAD_DIRECT) { - load_blocked(item, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); + load_blocked(item, block_itr, + (&items)[ITEMS_PER_WORK_ITEM]); } else if constexpr (ALGORITHM == BLOCK_LOAD_STRIPED) { - load_striped(item, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); + load_striped(item, block_itr, + (&items)[ITEMS_PER_WORK_ITEM]); } } From ebf6237ed51ffe6ed3749c080e6460d121d562c6 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Thu, 2 May 2024 21:28:29 +0530 Subject: [PATCH 56/65] use local_range --- .../runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 2202510786ab..72e208dec0c2 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -603,7 +603,7 @@ __dpct_inline__ void load_striped(const Item &item, InputIteratorT block_itr, // workgroup items To-do: Decide whether range loading is required for group // loading size_t linear_tid = item.get_local_linear_id(); - size_t group_work_items = item.get_global_range().size(); + size_t group_work_items = item.get_local_range().size(); #pragma unroll for (size_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { items[idx] = block_itr[linear_tid + (idx * group_work_items)]; From 7f9d4e62de12cafd62d87d5855b50dcd2e605313 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Mon, 6 May 2024 09:02:08 +0530 Subject: [PATCH 57/65] add comments --- .../dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 72e208dec0c2..71bec256f4b6 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -635,7 +635,12 @@ uninitialized_load_subgroup_striped(const Item &item, InputIteratorT block_itr, new (&items[idx]) InputT(block_itr[initial_offset + (idx * subgroup_size)]); } } - +// template parameters : +// ITEMS_PER_WORK_ITEM: size_t varaiable controlling the number of items per thread/work_item +// ALGORITHM: load_algorithm variable controlling the type of load operation. +// InputT: typename parameter controlled at runtime from input sequence. +// InputIteratorT: typename parameter for iterator pointer controlled at runtime. +// Item : typename parameter resembling sycl::nd_item<3> . template class workgroup_load { From cb87b67cc299d8cf1352b815f7e1c785ebe0fbe5 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Mon, 6 May 2024 10:44:36 +0530 Subject: [PATCH 58/65] format --- .../runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 71bec256f4b6..c64cfb27cae5 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -636,7 +636,8 @@ uninitialized_load_subgroup_striped(const Item &item, InputIteratorT block_itr, } } // template parameters : -// ITEMS_PER_WORK_ITEM: size_t varaiable controlling the number of items per thread/work_item +// ITEMS_PER_WORK_ITEM: size_t varaiable controlling the number of items per +// thread/work_item // ALGORITHM: load_algorithm variable controlling the type of load operation. // InputT: typename parameter controlled at runtime from input sequence. // InputIteratorT: typename parameter for iterator pointer controlled at runtime. From 71d40479737a59b25bed49575cf1ed3a1cce94a3 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Mon, 6 May 2024 11:00:20 +0530 Subject: [PATCH 59/65] format --- .../dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index c64cfb27cae5..2c6878119f39 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -635,12 +635,13 @@ uninitialized_load_subgroup_striped(const Item &item, InputIteratorT block_itr, new (&items[idx]) InputT(block_itr[initial_offset + (idx * subgroup_size)]); } } -// template parameters : +// template parameters : // ITEMS_PER_WORK_ITEM: size_t varaiable controlling the number of items per // thread/work_item // ALGORITHM: load_algorithm variable controlling the type of load operation. // InputT: typename parameter controlled at runtime from input sequence. -// InputIteratorT: typename parameter for iterator pointer controlled at runtime. +// InputIteratorT: typename parameter for iterator pointer controlled at +// runtime. // Item : typename parameter resembling sycl::nd_item<3> . template From bd24713b0e916ba0badc50f72554ad994707f1a7 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Tue, 7 May 2024 14:44:24 +0530 Subject: [PATCH 60/65] Update clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h Co-authored-by: Wang, Zhiming --- .../runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 2c6878119f39..eccca09fa991 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -636,7 +636,7 @@ uninitialized_load_subgroup_striped(const Item &item, InputIteratorT block_itr, } } // template parameters : -// ITEMS_PER_WORK_ITEM: size_t varaiable controlling the number of items per +// ITEMS_PER_WORK_ITEM: size_t variable controlling the number of items per // thread/work_item // ALGORITHM: load_algorithm variable controlling the type of load operation. // InputT: typename parameter controlled at runtime from input sequence. From 26a3ae2038cd2e17a8fc6d20e1a45b533812370b Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Tue, 7 May 2024 14:44:38 +0530 Subject: [PATCH 61/65] Update clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h Co-authored-by: Wang, Zhiming --- .../runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index eccca09fa991..6a81bc8aad97 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -639,7 +639,7 @@ uninitialized_load_subgroup_striped(const Item &item, InputIteratorT block_itr, // ITEMS_PER_WORK_ITEM: size_t variable controlling the number of items per // thread/work_item // ALGORITHM: load_algorithm variable controlling the type of load operation. -// InputT: typename parameter controlled at runtime from input sequence. +// InputT: type for input sequence. // InputIteratorT: typename parameter for iterator pointer controlled at // runtime. // Item : typename parameter resembling sycl::nd_item<3> . From 6222566c820a9c4e79e044edfd992447031b2a64 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Tue, 7 May 2024 14:44:50 +0530 Subject: [PATCH 62/65] Update clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h Co-authored-by: Wang, Zhiming --- .../runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 6a81bc8aad97..41e1991b819e 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -640,7 +640,7 @@ uninitialized_load_subgroup_striped(const Item &item, InputIteratorT block_itr, // thread/work_item // ALGORITHM: load_algorithm variable controlling the type of load operation. // InputT: type for input sequence. -// InputIteratorT: typename parameter for iterator pointer controlled at +// InputIteratorT: input iterator type // runtime. // Item : typename parameter resembling sycl::nd_item<3> . template Date: Tue, 7 May 2024 18:45:51 +0530 Subject: [PATCH 63/65] format --- .../runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 41e1991b819e..1377dbed6ed1 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -640,7 +640,7 @@ uninitialized_load_subgroup_striped(const Item &item, InputIteratorT block_itr, // thread/work_item // ALGORITHM: load_algorithm variable controlling the type of load operation. // InputT: type for input sequence. -// InputIteratorT: input iterator type +// InputIteratorT: input iterator type // runtime. // Item : typename parameter resembling sycl::nd_item<3> . template Date: Wed, 8 May 2024 22:25:49 +0530 Subject: [PATCH 64/65] fix issue in referencing --- .../dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 1377dbed6ed1..89748f95e1a2 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -654,11 +654,9 @@ class workgroup_load { InputT (&items)[ITEMS_PER_WORK_ITEM]) { if constexpr (ALGORITHM == BLOCK_LOAD_DIRECT) { - load_blocked(item, block_itr, - (&items)[ITEMS_PER_WORK_ITEM]); + load_blocked(item, block_itr, items); } else if constexpr (ALGORITHM == BLOCK_LOAD_STRIPED) { - load_striped(item, block_itr, - (&items)[ITEMS_PER_WORK_ITEM]); + load_striped(item, block_itr, items); } } From d382f60831a524e6cad6e960aea435a3d91efd49 Mon Sep 17 00:00:00 2001 From: "Wang, Zhiming" Date: Thu, 9 May 2024 11:32:57 +0800 Subject: [PATCH 65/65] Update dpcpp_extensions.h --- clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 1 - 1 file changed, 1 deletion(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 89748f95e1a2..15b0b1102290 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -641,7 +641,6 @@ uninitialized_load_subgroup_striped(const Item &item, InputIteratorT block_itr, // ALGORITHM: load_algorithm variable controlling the type of load operation. // InputT: type for input sequence. // InputIteratorT: input iterator type -// runtime. // Item : typename parameter resembling sycl::nd_item<3> . template