From c08ae4837add62c6e46239808ca379d7f7e71410 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Thu, 1 Feb 2024 00:40:38 -0800 Subject: [PATCH 01/36] block load tests --- help_function/help_function.xml | 4 +- help_function/src/onedpl_test_group_load.cpp | 106 +++++++++++++++++++ 2 files changed, 108 insertions(+), 2 deletions(-) create mode 100644 help_function/src/onedpl_test_group_load.cpp diff --git a/help_function/help_function.xml b/help_function/help_function.xml index 3f6db3867..34e5c3b89 100644 --- a/help_function/help_function.xml +++ b/help_function/help_function.xml @@ -133,9 +133,10 @@ - + + @@ -143,7 +144,6 @@ - diff --git a/help_function/src/onedpl_test_group_load.cpp b/help_function/src/onedpl_test_group_load.cpp new file mode 100644 index 000000000..f8737bffa --- /dev/null +++ b/help_function/src/onedpl_test_group_load.cpp @@ -0,0 +1,106 @@ +// ====------ onedpl_test_group_load.cpp---------- -*- C++ -* ----===//// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// +// ===----------------------------------------------------------------------===// +#include +#include +#include +#include + + +bool helper_function(const int* ptr,const char* func_name){ + int expected[512]; + for (int i = 0; i < 512; ++i) { + expected[i] = i; + } + for (int i = 0; i < 512; ++i) { + if (ptr[i] != expected[i]) { + std::cout << func_name <<" failed\n"; + std::ostream_iterator Iter(std::cout, ", "); + std::copy(ptr, ptr + 512, Iter); + std::cout << std::endl; + return false; + } + } + + std::cout << func_name <<" pass\n"; + return true; +} + +bool test_load_blocked() { + sycl::queue q; + int data[512]; + for (int i = 0; i < 512; i++) data[i] = i; + + sycl::buffer buffer(data, 512); + q.submit([&](sycl::handler &h) { + sycl::accessor dacc(buffer, h, sycl::read_write); + h.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)), + [=](sycl::nd_item<3> item) { + int thread_data[4]; + auto *d = dacc.get_multi_ptr().get(); + dpct::group::load_blocked<128>(item.get_local_linear_id(), d, thread_data); + }); + }); + q.wait_and_throw(); + + sycl::host_accessor data_accessor(buffer, sycl::read_only); + const int *ptr = data_accessor.get_multi_ptr(); + return helper_function(ptr,"test_load_blocked"); +} + +bool test_load_striped() { + sycl::queue q; + int data[512]; + for (int i = 0; i < 512; i++) data[i] = i; + + sycl::buffer buffer(data, 512); + q.submit([&](sycl::handler &h) { + sycl::accessor dacc(buffer, h, sycl::read_write); + h.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)), + [=](sycl::nd_item<3> item) { + int thread_data[4]; + auto *d = dacc.get_multi_ptr().get(); + dpct::group::load_striped<128>(item.get_local_linear_id(), d, thread_data); + }); + }); + q.wait_and_throw(); + + sycl::host_accessor data_accessor(buffer, sycl::read_only); + const int *ptr = data_accessor.get_multi_ptr(); + return helper_function(ptr,"test_load_blocked"); +} + +bool test_load_subgroup_striped() { + sycl::queue q; + int data[512]; + for (int i = 0; i < 512; i++) data[i] = i; + + sycl::buffer buffer(data, 512); + q.submit([&](sycl::handler &h) { + sycl::accessor dacc(buffer, h, sycl::read_write); + h.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)), + [=](sycl::nd_item<3> item) { + int thread_data[4]; + auto *d = dacc.get_multi_ptr().get(); + dpct::group::load_subgroup_striped<128, item>(item, item.get_local_linear_id(), d, thread_data); + }); + }); + q.wait_and_throw(); + + sycl::host_accessor data_accessor(buffer, sycl::read_only); + const int *ptr = data_accessor.get_multi_ptr(); + return helper_function(ptr,"test_load_blocked"); +} + + +int main() { + return !(test_load_blocked() && test_load_striped() && test_load_warp_striped()); +} From 513b1037b5f74e88a74dc1fe6d645312e7e6743b Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Wed, 3 Apr 2024 22:19:52 +0530 Subject: [PATCH 02/36] update test --- help_function/src/onedpl_test_group_load.cpp | 29 +++++++++++++------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/help_function/src/onedpl_test_group_load.cpp b/help_function/src/onedpl_test_group_load.cpp index f8737bffa..cab8df6a3 100644 --- a/help_function/src/onedpl_test_group_load.cpp +++ b/help_function/src/onedpl_test_group_load.cpp @@ -12,7 +12,8 @@ #include -bool helper_function(const int* ptr,const char* func_name){ +bool helper_validation_function(const int* ptr,const char* func_name){ + // Used for validation of output and expected output sequences int expected[512]; for (int i = 0; i < 512; ++i) { expected[i] = i; @@ -38,20 +39,24 @@ bool test_load_blocked() { sycl::buffer buffer(data, 512); q.submit([&](sycl::handler &h) { - sycl::accessor dacc(buffer, h, sycl::read_write); + using workgroup_load = dpct::group::workgroup_load<128, BLOCK_LOAD_DIRECT, int>; + size_t temp_storage_size = workgroup_load::get_local_memory_size(128); + sycl::local_accessor tacc(sycl::range<1>(temp_storage_size), h); + sycl::accessor data_accessor(buffer, h, sycl::read_write); h.parallel_for( sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)), [=](sycl::nd_item<3> item) { int thread_data[4]; - auto *d = dacc.get_multi_ptr().get(); - dpct::group::load_blocked<128>(item.get_local_linear_id(), d, thread_data); + auto *d = data_accessor.get_multi_ptr().get(); + auto *tmp = tacc.get_multi_ptr().get(); + group_load(tmp).load(item,item.get_local_linear_id(), d, thread_data); }); }); q.wait_and_throw(); sycl::host_accessor data_accessor(buffer, sycl::read_only); const int *ptr = data_accessor.get_multi_ptr(); - return helper_function(ptr,"test_load_blocked"); + return helper_validation_function(ptr,"test_load_blocked"); } bool test_load_striped() { @@ -61,20 +66,24 @@ bool test_load_striped() { sycl::buffer buffer(data, 512); q.submit([&](sycl::handler &h) { - sycl::accessor dacc(buffer, h, sycl::read_write); + using workgroup_load = dpct::group::workgroup_load<128, BLOCK_LOAD_STRIPED, int>; + size_t temp_storage_size = workgroup_load::get_local_memory_size(128); + sycl::local_accessor tacc(sycl::range<1>(temp_storage_size), h); + sycl::accessor data_accessor(buffer, h, sycl::read_write); h.parallel_for( sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)), [=](sycl::nd_item<3> item) { int thread_data[4]; - auto *d = dacc.get_multi_ptr().get(); - dpct::group::load_striped<128>(item.get_local_linear_id(), d, thread_data); + auto *d = data_accessor.get_multi_ptr().get(); + auto *tmp = tacc.get_multi_ptr().get(); + group_load(tmp).load(item, d, thread_data); }); }); q.wait_and_throw(); sycl::host_accessor data_accessor(buffer, sycl::read_only); const int *ptr = data_accessor.get_multi_ptr(); - return helper_function(ptr,"test_load_blocked"); + return helper_validation_function(ptr,"test_load_blocked"); } bool test_load_subgroup_striped() { @@ -90,7 +99,7 @@ bool test_load_subgroup_striped() { [=](sycl::nd_item<3> item) { int thread_data[4]; auto *d = dacc.get_multi_ptr().get(); - dpct::group::load_subgroup_striped<128, item>(item, item.get_local_linear_id(), d, thread_data); + dpct::group::uninitialized_load_subgroup_striped<128>(item, d, thread_data); }); }); q.wait_and_throw(); From d8b7cb7734b361116070eaa09546f28b1048cd0b Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Thu, 4 Apr 2024 11:50:52 +0530 Subject: [PATCH 03/36] use onedpl iterator --- help_function/src/onedpl_test_group_load.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/help_function/src/onedpl_test_group_load.cpp b/help_function/src/onedpl_test_group_load.cpp index cab8df6a3..e5a8fc01c 100644 --- a/help_function/src/onedpl_test_group_load.cpp +++ b/help_function/src/onedpl_test_group_load.cpp @@ -10,14 +10,13 @@ #include #include #include +#include bool helper_validation_function(const int* ptr,const char* func_name){ // Used for validation of output and expected output sequences - int expected[512]; - for (int i = 0; i < 512; ++i) { - expected[i] = i; - } + oneapi::dpl::counting_iterator expected(0); + for (int i = 0; i < 512; ++i) { if (ptr[i] != expected[i]) { std::cout << func_name <<" failed\n"; From be5eefe731f553ee9eea8b35ece76305c7bcf425 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Fri, 5 Apr 2024 13:22:42 +0530 Subject: [PATCH 04/36] rm duplicate function & iterator --- help_function/src/onedpl_test_group_load.cpp | 45 ++++---------------- 1 file changed, 8 insertions(+), 37 deletions(-) diff --git a/help_function/src/onedpl_test_group_load.cpp b/help_function/src/onedpl_test_group_load.cpp index e5a8fc01c..1c361cd2f 100644 --- a/help_function/src/onedpl_test_group_load.cpp +++ b/help_function/src/onedpl_test_group_load.cpp @@ -15,10 +15,9 @@ bool helper_validation_function(const int* ptr,const char* func_name){ // Used for validation of output and expected output sequences - oneapi::dpl::counting_iterator expected(0); for (int i = 0; i < 512; ++i) { - if (ptr[i] != expected[i]) { + if (ptr[i] != i) { std::cout << func_name <<" failed\n"; std::ostream_iterator Iter(std::cout, ", "); std::copy(ptr, ptr + 512, Iter); @@ -31,14 +30,13 @@ bool helper_validation_function(const int* ptr,const char* func_name){ return true; } -bool test_load_blocked() { +bool test_load_blocked_striped(dpct::group::load_algorithm T) { sycl::queue q; - int data[512]; - for (int i = 0; i < 512; i++) data[i] = i; - - sycl::buffer buffer(data, 512); + oneapi::dpl::counting_iterator count_it(0); + sycl::buffer buffer(count_it, count_it + 512); + q.submit([&](sycl::handler &h) { - using workgroup_load = dpct::group::workgroup_load<128, BLOCK_LOAD_DIRECT, int>; + using workgroup_load = dpct::group::workgroup_load<128, T, int>; size_t temp_storage_size = workgroup_load::get_local_memory_size(128); sycl::local_accessor tacc(sycl::range<1>(temp_storage_size), h); sycl::accessor data_accessor(buffer, h, sycl::read_write); @@ -58,33 +56,6 @@ bool test_load_blocked() { return helper_validation_function(ptr,"test_load_blocked"); } -bool test_load_striped() { - sycl::queue q; - int data[512]; - for (int i = 0; i < 512; i++) data[i] = i; - - sycl::buffer buffer(data, 512); - q.submit([&](sycl::handler &h) { - using workgroup_load = dpct::group::workgroup_load<128, BLOCK_LOAD_STRIPED, int>; - size_t temp_storage_size = workgroup_load::get_local_memory_size(128); - sycl::local_accessor tacc(sycl::range<1>(temp_storage_size), h); - sycl::accessor data_accessor(buffer, h, sycl::read_write); - h.parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)), - [=](sycl::nd_item<3> item) { - int thread_data[4]; - auto *d = data_accessor.get_multi_ptr().get(); - auto *tmp = tacc.get_multi_ptr().get(); - group_load(tmp).load(item, d, thread_data); - }); - }); - q.wait_and_throw(); - - sycl::host_accessor data_accessor(buffer, sycl::read_only); - const int *ptr = data_accessor.get_multi_ptr(); - return helper_validation_function(ptr,"test_load_blocked"); -} - bool test_load_subgroup_striped() { sycl::queue q; int data[512]; @@ -105,10 +76,10 @@ bool test_load_subgroup_striped() { sycl::host_accessor data_accessor(buffer, sycl::read_only); const int *ptr = data_accessor.get_multi_ptr(); - return helper_function(ptr,"test_load_blocked"); + return helper_validation_function(ptr,"test_load_blocked"); } int main() { - return !(test_load_blocked() && test_load_striped() && test_load_warp_striped()); + return !(test_load_blocked_striped(BLOCK_LOAD_DIRECT) && test_load_blocked_striped(BLOCK_LOAD_STRIPED) && test_load_subgroup_striped()); } From 66ec770dae585389dcb7fc9eb81645888ed1174a Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Tue, 9 Apr 2024 13:12:44 +0530 Subject: [PATCH 05/36] Update help_function/src/onedpl_test_group_load.cpp Co-authored-by: Yihan Wang --- help_function/src/onedpl_test_group_load.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/help_function/src/onedpl_test_group_load.cpp b/help_function/src/onedpl_test_group_load.cpp index 1c361cd2f..0fa5f5d4a 100644 --- a/help_function/src/onedpl_test_group_load.cpp +++ b/help_function/src/onedpl_test_group_load.cpp @@ -1,4 +1,5 @@ -// ====------ onedpl_test_group_load.cpp---------- -*- C++ -* ----===//// +// ====------ onedpl_test_group_load.cpp------------ *- C++ -* ----===// + // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. From 146c0d17ec64822636ac856382abf87875c879c0 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Tue, 9 Apr 2024 13:12:55 +0530 Subject: [PATCH 06/36] Update help_function/src/onedpl_test_group_load.cpp Co-authored-by: Yihan Wang --- help_function/src/onedpl_test_group_load.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/help_function/src/onedpl_test_group_load.cpp b/help_function/src/onedpl_test_group_load.cpp index 0fa5f5d4a..f6647030a 100644 --- a/help_function/src/onedpl_test_group_load.cpp +++ b/help_function/src/onedpl_test_group_load.cpp @@ -11,7 +11,8 @@ #include #include #include -#include +#include + bool helper_validation_function(const int* ptr,const char* func_name){ From ec5d9a8c3ba9bd91872cb5c5000185428efcca65 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Tue, 9 Apr 2024 13:13:11 +0530 Subject: [PATCH 07/36] Update help_function/src/onedpl_test_group_load.cpp Co-authored-by: Yihan Wang --- help_function/src/onedpl_test_group_load.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/help_function/src/onedpl_test_group_load.cpp b/help_function/src/onedpl_test_group_load.cpp index f6647030a..1fbed0e70 100644 --- a/help_function/src/onedpl_test_group_load.cpp +++ b/help_function/src/onedpl_test_group_load.cpp @@ -15,7 +15,8 @@ -bool helper_validation_function(const int* ptr,const char* func_name){ +bool helper_validation_function(const int* ptr, const char* func_name) { + // Used for validation of output and expected output sequences for (int i = 0; i < 512; ++i) { From 400f48d655d183f89afc3473c8e35e5dd49b39e8 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Thu, 11 Apr 2024 10:53:28 +0530 Subject: [PATCH 08/36] rm unwanted test --- help_function/help_function.xml | 1 - 1 file changed, 1 deletion(-) diff --git a/help_function/help_function.xml b/help_function/help_function.xml index 34e5c3b89..8e9a52929 100644 --- a/help_function/help_function.xml +++ b/help_function/help_function.xml @@ -133,7 +133,6 @@ - From 5878cb710dc241e0970505f9cfcbe25f500e1fd0 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Thu, 11 Apr 2024 12:41:06 +0530 Subject: [PATCH 09/36] update review comments --- help_function/src/onedpl_test_group_load.cpp | 52 +++++++++++++++++--- 1 file changed, 45 insertions(+), 7 deletions(-) diff --git a/help_function/src/onedpl_test_group_load.cpp b/help_function/src/onedpl_test_group_load.cpp index 1fbed0e70..63f2071fb 100644 --- a/help_function/src/onedpl_test_group_load.cpp +++ b/help_function/src/onedpl_test_group_load.cpp @@ -43,23 +43,28 @@ bool test_load_blocked_striped(dpct::group::load_algorithm T) { size_t temp_storage_size = workgroup_load::get_local_memory_size(128); sycl::local_accessor tacc(sycl::range<1>(temp_storage_size), h); sycl::accessor data_accessor(buffer, h, sycl::read_write); + int thread_data[4]; h.parallel_for( sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)), [=](sycl::nd_item<3> item) { - int thread_data[4]; auto *d = data_accessor.get_multi_ptr().get(); auto *tmp = tacc.get_multi_ptr().get(); - group_load(tmp).load(item,item.get_local_linear_id(), d, thread_data); + group_load(tmp).load(item, d, thread_data); + // Write thread_data of each work item at index to the global buffer + int global_index = item.get_global_linear_id() * 4; // Each thread_data has 4 elements + for (int i = 0; i < 4; ++i) { + buffer[global_index + i] = thread_data[i]; + } }); }); q.wait_and_throw(); - + sycl::host_accessor data_accessor(buffer, sycl::read_only); const int *ptr = data_accessor.get_multi_ptr(); - return helper_validation_function(ptr,"test_load_blocked"); + return helper_validation_function(ptr); } -bool test_load_subgroup_striped() { +bool test_load_subgroup_striped_standalone() { sycl::queue q; int data[512]; for (int i = 0; i < 512; i++) data[i] = i; @@ -73,16 +78,49 @@ bool test_load_subgroup_striped() { int thread_data[4]; auto *d = dacc.get_multi_ptr().get(); dpct::group::uninitialized_load_subgroup_striped<128>(item, d, thread_data); + // Write thread_data of each work item at index to the global buffer + int global_index = item.get_global_linear_id() * 4; // Each thread_data has 4 elements + for (int i = 0; i < 4; ++i) { + buffer[global_index + i] = thread_data[i]; + } }); }); q.wait_and_throw(); sycl::host_accessor data_accessor(buffer, sycl::read_only); const int *ptr = data_accessor.get_multi_ptr(); - return helper_validation_function(ptr,"test_load_blocked"); + return helper_validation_function(ptr); } +bool test_load_blocked_striped_standalone(dpct::group::load_algorithm T) { + sycl::queue q; + int data[512]; + for (int i = 0; i < 512; i++) data[i] = i; + + sycl::buffer buffer(data, 512); + q.submit([&](sycl::handler &h) { + sycl::accessor dacc(buffer, h, sycl::read_write); + h.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)), + [=](sycl::nd_item<3> item) { + int thread_data[4]; + auto *d = dacc.get_multi_ptr().get(); + dpct::group::load_blocked<128, T>(item, d, thread_data); + // Write thread_data of each work item at index to the global buffer + int global_index = item.get_global_linear_id() * 4; // Each thread_data has 4 elements + for (int i = 0; i < 4; ++i) { + buffer[global_index + i] = thread_data[i]; + } + }); + }); + q.wait_and_throw(); + + sycl::host_accessor data_accessor(buffer, sycl::read_only); + const int *ptr = data_accessor.get_multi_ptr(); + return helper_validation_function(ptr); +} int main() { - return !(test_load_blocked_striped(BLOCK_LOAD_DIRECT) && test_load_blocked_striped(BLOCK_LOAD_STRIPED) && test_load_subgroup_striped()); + return !(test_load_blocked_striped(dpct::group::load_algorithm::BLOCK_LOAD_DIRECT) && test_load_blocked_striped(dpct::group::load_algorithm::BLOCK_LOAD_STRIPED) && test_load_subgroup_striped_standalone() + && test_load_blocked_striped_standalone(dpct::group::load_algorithm::BLOCK_LOAD_STRIPED) && test_load_blocked_striped_standalone(dpct::group::load_algorithm::BLOCK_LOAD_DIRECT)); } From ae0e1d64461871f0faaf293d809dccd1400080d8 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Thu, 11 Apr 2024 12:44:44 +0530 Subject: [PATCH 10/36] rebase from upstream --- help_function/help_function.xml | 1 + 1 file changed, 1 insertion(+) diff --git a/help_function/help_function.xml b/help_function/help_function.xml index 6a6eaadbe..2e9f9f2f8 100644 --- a/help_function/help_function.xml +++ b/help_function/help_function.xml @@ -140,6 +140,7 @@ + From d2ae0ee86f745232e6ce51ff85dfc5e139a9d8ee Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Tue, 30 Apr 2024 00:34:44 +0530 Subject: [PATCH 11/36] update tests --- help_function/src/onedpl_test_group_load.cpp | 78 +++++++++++++------- 1 file changed, 52 insertions(+), 26 deletions(-) diff --git a/help_function/src/onedpl_test_group_load.cpp b/help_function/src/onedpl_test_group_load.cpp index 63f2071fb..98acc5b82 100644 --- a/help_function/src/onedpl_test_group_load.cpp +++ b/help_function/src/onedpl_test_group_load.cpp @@ -13,27 +13,9 @@ #include #include - - -bool helper_validation_function(const int* ptr, const char* func_name) { - - // Used for validation of output and expected output sequences - - for (int i = 0; i < 512; ++i) { - if (ptr[i] != i) { - std::cout << func_name <<" failed\n"; - std::ostream_iterator Iter(std::cout, ", "); - std::copy(ptr, ptr + 512, Iter); - std::cout << std::endl; - return false; - } - } - - std::cout << func_name <<" pass\n"; - return true; -} - bool test_load_blocked_striped(dpct::group::load_algorithm T) { + // Tests dpct::group::load_algorithm::BLOCK_LOAD_DIRECT & dpct::group::load_algorithm::BLOCK_LOAD_STRIPED + // in its entirety as API functions sycl::queue q; oneapi::dpl::counting_iterator count_it(0); sycl::buffer buffer(count_it, count_it + 512); @@ -61,10 +43,22 @@ bool test_load_blocked_striped(dpct::group::load_algorithm T) { sycl::host_accessor data_accessor(buffer, sycl::read_only); const int *ptr = data_accessor.get_multi_ptr(); - return helper_validation_function(ptr); + for (int i = 0; i < 512; ++i) { + if (ptr[i] != i) { + std::cout << func_name <<" failed\n"; + std::ostream_iterator Iter(std::cout, ", "); + std::copy(ptr, ptr + 512, Iter); + std::cout << std::endl; + return false; + } + } + + std::cout <<" pass\n"; + return true; } bool test_load_subgroup_striped_standalone() { + // Tests dpct::group::load_subgroup_striped as standalone method sycl::queue q; int data[512]; for (int i = 0; i < 512; i++) data[i] = i; @@ -80,8 +74,8 @@ bool test_load_subgroup_striped_standalone() { dpct::group::uninitialized_load_subgroup_striped<128>(item, d, thread_data); // Write thread_data of each work item at index to the global buffer int global_index = item.get_global_linear_id() * 4; // Each thread_data has 4 elements - for (int i = 0; i < 4; ++i) { - buffer[global_index + i] = thread_data[i]; + for (int i = 0; i < 4; ++i) { + dacc[global_index + i] = thread_data[i]; } }); }); @@ -89,10 +83,30 @@ bool test_load_subgroup_striped_standalone() { sycl::host_accessor data_accessor(buffer, sycl::read_only); const int *ptr = data_accessor.get_multi_ptr(); - return helper_validation_function(ptr); + int expected[512]; + for (int i = 0; i < 128; i++) { + expected[4 * i + 0] = i; + expected[4 * i + 1] = 4 * i + 1; + expected[4 * i + 2] = 4 * i + 2; + expected[4 * i + 3] = 4 * i + 3; + } + for (int i = 0; i < 512; ++i) { + if (ptr[i] != expected[i]) { + std::cout << func_name <<" failed\n"; + std::ostream_iterator Iter(std::cout, ", "); + std::copy(ptr, ptr + 512, Iter); + std::cout << std::endl; + return false; + } + } + + std::cout <<" pass\n"; + return true; } bool test_load_blocked_striped_standalone(dpct::group::load_algorithm T) { + // Tests dpct::group::load_algorithm::BLOCK_LOAD_DIRECT & dpct::group::load_algorithm::BLOCK_LOAD_STRIPED + // as standalone methods sycl::queue q; int data[512]; for (int i = 0; i < 512; i++) data[i] = i; @@ -109,7 +123,7 @@ bool test_load_blocked_striped_standalone(dpct::group::load_algorithm T) { // Write thread_data of each work item at index to the global buffer int global_index = item.get_global_linear_id() * 4; // Each thread_data has 4 elements for (int i = 0; i < 4; ++i) { - buffer[global_index + i] = thread_data[i]; + dacc[global_index + i] = thread_data[i]; } }); }); @@ -117,10 +131,22 @@ bool test_load_blocked_striped_standalone(dpct::group::load_algorithm T) { sycl::host_accessor data_accessor(buffer, sycl::read_only); const int *ptr = data_accessor.get_multi_ptr(); - return helper_validation_function(ptr); + for (int i = 0; i < 512; ++i) { + if (ptr[i] != i) { + std::cout << func_name <<" failed\n"; + std::ostream_iterator Iter(std::cout, ", "); + std::copy(ptr, ptr + 512, Iter); + std::cout << std::endl; + return false; + } + } + + std::cout <<" pass\n"; + return true; } int main() { + return !(test_load_blocked_striped(dpct::group::load_algorithm::BLOCK_LOAD_DIRECT) && test_load_blocked_striped(dpct::group::load_algorithm::BLOCK_LOAD_STRIPED) && test_load_subgroup_striped_standalone() && test_load_blocked_striped_standalone(dpct::group::load_algorithm::BLOCK_LOAD_STRIPED) && test_load_blocked_striped_standalone(dpct::group::load_algorithm::BLOCK_LOAD_DIRECT)); } From 23b7023c2b232c14d632c49134d2ff6cc49df726 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Tue, 30 Apr 2024 10:03:56 +0530 Subject: [PATCH 12/36] update template param --- help_function/src/onedpl_test_group_load.cpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/help_function/src/onedpl_test_group_load.cpp b/help_function/src/onedpl_test_group_load.cpp index 98acc5b82..9dcd4e4ed 100644 --- a/help_function/src/onedpl_test_group_load.cpp +++ b/help_function/src/onedpl_test_group_load.cpp @@ -13,7 +13,8 @@ #include #include -bool test_load_blocked_striped(dpct::group::load_algorithm T) { +template +bool test_load_blocked_striped() { // Tests dpct::group::load_algorithm::BLOCK_LOAD_DIRECT & dpct::group::load_algorithm::BLOCK_LOAD_STRIPED // in its entirety as API functions sycl::queue q; @@ -56,7 +57,7 @@ bool test_load_blocked_striped(dpct::group::load_algorithm T) { std::cout <<" pass\n"; return true; } - +template bool test_load_subgroup_striped_standalone() { // Tests dpct::group::load_subgroup_striped as standalone method sycl::queue q; @@ -71,7 +72,7 @@ bool test_load_subgroup_striped_standalone() { [=](sycl::nd_item<3> item) { int thread_data[4]; auto *d = dacc.get_multi_ptr().get(); - dpct::group::uninitialized_load_subgroup_striped<128>(item, d, thread_data); + dpct::group::uninitialized_load_subgroup_striped<128, T, int>(item, d, thread_data); // Write thread_data of each work item at index to the global buffer int global_index = item.get_global_linear_id() * 4; // Each thread_data has 4 elements for (int i = 0; i < 4; ++i) { @@ -104,7 +105,8 @@ bool test_load_subgroup_striped_standalone() { return true; } -bool test_load_blocked_striped_standalone(dpct::group::load_algorithm T) { +template +bool test_load_blocked_striped_standalone() { // Tests dpct::group::load_algorithm::BLOCK_LOAD_DIRECT & dpct::group::load_algorithm::BLOCK_LOAD_STRIPED // as standalone methods sycl::queue q; @@ -119,7 +121,7 @@ bool test_load_blocked_striped_standalone(dpct::group::load_algorithm T) { [=](sycl::nd_item<3> item) { int thread_data[4]; auto *d = dacc.get_multi_ptr().get(); - dpct::group::load_blocked<128, T>(item, d, thread_data); + dpct::group::load_blocked<128, T, int>(item, d, thread_data); // Write thread_data of each work item at index to the global buffer int global_index = item.get_global_linear_id() * 4; // Each thread_data has 4 elements for (int i = 0; i < 4; ++i) { @@ -147,6 +149,6 @@ bool test_load_blocked_striped_standalone(dpct::group::load_algorithm T) { int main() { - return !(test_load_blocked_striped(dpct::group::load_algorithm::BLOCK_LOAD_DIRECT) && test_load_blocked_striped(dpct::group::load_algorithm::BLOCK_LOAD_STRIPED) && test_load_subgroup_striped_standalone() - && test_load_blocked_striped_standalone(dpct::group::load_algorithm::BLOCK_LOAD_STRIPED) && test_load_blocked_striped_standalone(dpct::group::load_algorithm::BLOCK_LOAD_DIRECT)); + return !(test_load_blocked_striped() && test_load_blocked_striped() && test_load_subgroup_striped_standalone() + && test_load_blocked_striped_standalone() && test_load_blocked_striped_standalone()); } From 7c2ff7e493236a56140dc8f59d56128dad6f6ecc Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Tue, 30 Apr 2024 14:13:01 +0530 Subject: [PATCH 13/36] fix compile issues --- help_function/src/onedpl_test_group_load.cpp | 85 ++++++++++++++------ 1 file changed, 62 insertions(+), 23 deletions(-) diff --git a/help_function/src/onedpl_test_group_load.cpp b/help_function/src/onedpl_test_group_load.cpp index 9dcd4e4ed..144bbaadc 100644 --- a/help_function/src/onedpl_test_group_load.cpp +++ b/help_function/src/onedpl_test_group_load.cpp @@ -13,7 +13,7 @@ #include #include -template +template bool test_load_blocked_striped() { // Tests dpct::group::load_algorithm::BLOCK_LOAD_DIRECT & dpct::group::load_algorithm::BLOCK_LOAD_STRIPED // in its entirety as API functions @@ -22,11 +22,12 @@ bool test_load_blocked_striped() { sycl::buffer buffer(count_it, count_it + 512); q.submit([&](sycl::handler &h) { - using workgroup_load = dpct::group::workgroup_load<128, T, int>; - size_t temp_storage_size = workgroup_load::get_local_memory_size(128); - sycl::local_accessor tacc(sycl::range<1>(temp_storage_size), h); sycl::accessor data_accessor(buffer, h, sycl::read_write); int thread_data[4]; + using group_load = dpct::group::workgroup_load<4, T, int, sycl::accessor, sycl::nd_item<3>>; + size_t temp_storage_size = group_load::get_local_memory_size(128); + sycl::local_accessor tacc(sycl::range<1>(temp_storage_size), h); + h.parallel_for( sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)), [=](sycl::nd_item<3> item) { @@ -36,7 +37,7 @@ bool test_load_blocked_striped() { // Write thread_data of each work item at index to the global buffer int global_index = item.get_global_linear_id() * 4; // Each thread_data has 4 elements for (int i = 0; i < 4; ++i) { - buffer[global_index + i] = thread_data[i]; + data_accessor[global_index + i] = thread_data[i]; } }); }); @@ -46,7 +47,7 @@ bool test_load_blocked_striped() { const int *ptr = data_accessor.get_multi_ptr(); for (int i = 0; i < 512; ++i) { if (ptr[i] != i) { - std::cout << func_name <<" failed\n"; + std::cout <<" failed\n"; std::ostream_iterator Iter(std::cout, ", "); std::copy(ptr, ptr + 512, Iter); std::cout << std::endl; @@ -57,7 +58,7 @@ bool test_load_blocked_striped() { std::cout <<" pass\n"; return true; } -template + bool test_load_subgroup_striped_standalone() { // Tests dpct::group::load_subgroup_striped as standalone method sycl::queue q; @@ -72,7 +73,7 @@ bool test_load_subgroup_striped_standalone() { [=](sycl::nd_item<3> item) { int thread_data[4]; auto *d = dacc.get_multi_ptr().get(); - dpct::group::uninitialized_load_subgroup_striped<128, T, int>(item, d, thread_data); + dpct::group::uninitialized_load_subgroup_striped<4, int>(item, d, thread_data); // Write thread_data of each work item at index to the global buffer int global_index = item.get_global_linear_id() * 4; // Each thread_data has 4 elements for (int i = 0; i < 4; ++i) { @@ -93,7 +94,7 @@ bool test_load_subgroup_striped_standalone() { } for (int i = 0; i < 512; ++i) { if (ptr[i] != expected[i]) { - std::cout << func_name <<" failed\n"; + std::cout <<" failed\n"; std::ostream_iterator Iter(std::cout, ", "); std::copy(ptr, ptr + 512, Iter); std::cout << std::endl; @@ -105,7 +106,7 @@ bool test_load_subgroup_striped_standalone() { return true; } -template +template bool test_load_blocked_striped_standalone() { // Tests dpct::group::load_algorithm::BLOCK_LOAD_DIRECT & dpct::group::load_algorithm::BLOCK_LOAD_STRIPED // as standalone methods @@ -121,7 +122,10 @@ bool test_load_blocked_striped_standalone() { [=](sycl::nd_item<3> item) { int thread_data[4]; auto *d = dacc.get_multi_ptr().get(); - dpct::group::load_blocked<128, T, int>(item, d, thread_data); + if( T == dpct::group::load_algorithm::BLOCK_LOAD_DIRECT) + {dpct::group::load_blocked<4, T, int>(item, d, thread_data);} + else + {dpct::group::load_striped<4, T, int>(item, d, thread_data);} // Write thread_data of each work item at index to the global buffer int global_index = item.get_global_linear_id() * 4; // Each thread_data has 4 elements for (int i = 0; i < 4; ++i) { @@ -133,22 +137,57 @@ bool test_load_blocked_striped_standalone() { sycl::host_accessor data_accessor(buffer, sycl::read_only); const int *ptr = data_accessor.get_multi_ptr(); - for (int i = 0; i < 512; ++i) { - if (ptr[i] != i) { - std::cout << func_name <<" failed\n"; - std::ostream_iterator Iter(std::cout, ", "); - std::copy(ptr, ptr + 512, Iter); - std::cout << std::endl; - return false; - } + if(T == dpct::group::load_algorithm::BLOCK_LOAD_DIRECT) + { + for (int i = 0; i < 512; ++i) { + if (ptr[i] != i) { + std::cout <<" failed\n"; + std::ostream_iterator Iter(std::cout, ", "); + std::copy(ptr, ptr + 512, Iter); + std::cout << std::endl; + return false; + } + } + + std::cout <<" pass\n"; + return true; } + else{ + int expected[512]; + + for (int i = 4; i < 128; ++i) { + expected[4 * i + 0] = 4 * i + (128 * 0); + expected[4 * i + 1] = 4 * i + 1 + (128 * 1); + expected[4 * i + 2] = 4 * i + 2 + (128 * 2); + expected[4 * i + 3] = 4 * i + 3 + (128 * 3); + + } - std::cout <<" pass\n"; - return true; + for(int i=0;i<512;i++){std::cout< Iter(std::cout, ", "); + std::copy(ptr, ptr + 512, Iter); + std::cout << std::endl; + return false; + } + } + + std::cout <<" pass\n"; + return true; + + } } + int main() { - return !(test_load_blocked_striped() && test_load_blocked_striped() && test_load_subgroup_striped_standalone() - && test_load_blocked_striped_standalone() && test_load_blocked_striped_standalone()); + return !(//test_load_blocked_striped() && test_load_blocked_striped() && + test_load_subgroup_striped_standalone() && + test_load_blocked_striped_standalone() && test_load_blocked_striped_standalone()); } From fe51693ac93c3454ecb000e3951c6283b2b6d0a1 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Tue, 30 Apr 2024 17:38:44 +0530 Subject: [PATCH 14/36] update & fix all tests --- help_function/src/onedpl_test_group_load.cpp | 55 ++++++++++---------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/help_function/src/onedpl_test_group_load.cpp b/help_function/src/onedpl_test_group_load.cpp index 144bbaadc..267b971dc 100644 --- a/help_function/src/onedpl_test_group_load.cpp +++ b/help_function/src/onedpl_test_group_load.cpp @@ -22,15 +22,15 @@ bool test_load_blocked_striped() { sycl::buffer buffer(count_it, count_it + 512); q.submit([&](sycl::handler &h) { - sycl::accessor data_accessor(buffer, h, sycl::read_write); - int thread_data[4]; - using group_load = dpct::group::workgroup_load<4, T, int, sycl::accessor, sycl::nd_item<3>>; + using group_load = dpct::group::workgroup_load<4, T, int, int *, sycl::nd_item<3>>; size_t temp_storage_size = group_load::get_local_memory_size(128); sycl::local_accessor tacc(sycl::range<1>(temp_storage_size), h); + sycl::accessor data_accessor(buffer, h, sycl::read_write); h.parallel_for( sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)), [=](sycl::nd_item<3> item) { + int thread_data[4]; auto *d = data_accessor.get_multi_ptr().get(); auto *tmp = tacc.get_multi_ptr().get(); group_load(tmp).load(item, d, thread_data); @@ -45,7 +45,10 @@ bool test_load_blocked_striped() { sycl::host_accessor data_accessor(buffer, sycl::read_only); const int *ptr = data_accessor.get_multi_ptr(); - for (int i = 0; i < 512; ++i) { + + if ( T == dpct::group::load_algorithm::BLOCK_LOAD_DIRECT) + { + for (int i = 0; i < 512; ++i) { if (ptr[i] != i) { std::cout <<" failed\n"; std::ostream_iterator Iter(std::cout, ", "); @@ -57,6 +60,23 @@ bool test_load_blocked_striped() { std::cout <<" pass\n"; return true; + } + else{ + + int expected[512]={0, 128, 256, 384, 1, 129, 257, 385, 2, 130, 258, 386, 3, 131, 259, 387, 4, 132, 260, 388, 5, 133, 261, 389, 6, 134, 262, 390, 7, 135, 263, 391, 8, 136, 264, 392, 9, 137, 265, 393, 10, 138, 266, 394, 11, 139, 267, 395, 12, 140, 268, 396, 13, 141, 269, 397, 14, 142, 270, 398, 15, 143, 271, 399, 4, 144, 272, 400, 132, 145, 273, 401, 260, 146, 274, 402, 388, 147, 275, 403, 5, 148, 276, 404, 133, 149, 277, 405, 261, 150, 278, 406, 389, 151, 279, 407, 6, 152, 280, 408, 134, 153, 281, 409, 262, 154, 282, 410, 390, 155, 283, 411, 7, 156, 284, 412, 135, 157, 285, 413, 263, 158, 286, 414, 391, 159, 287, 415, 8, 160, 288, 416, 136, 161, 289, 417, 264, 162, 290, 418, 392, 163, 291, 419, 9, 164, 292, 420, 137, 165, 293, 421, 265, 166, 294, 422, 393, 167, 295, 423, 10, 168, 296, 424, 138, 169, 297, 425, 266, 170, 298, 426, 394, 171, 299, 427, 11, 172, 300, 428, 139, 173, 301, 429, 267, 174, 302, 430, 395, 175, 303, 431, 12, 11, 304, 432, 140, 172, 305, 433, 268, 300, 306, 434, 396, 428, 307, 435, 13, 139, 308, 436, 141, 173, 309, 437, 269, 301, 310, 438, 397, 429, 311, 439, 14, 267, 312, 440, 142, 174, 313, 441, 270, 302, 314, 442, 398, 430, 315, 443, 15, 395, 316, 444, 143, 175, 317, 445, 271, 303, 318, 446, 399, 431, 319, 447, 4, 12, 320, 448, 144, 11, 321, 449, 272, 304, 322, 450, 400, 432, 323, 451, 132, 140, 324, 452, 145, 172, 325, 453, 273, 305, 326, 454, 401, 433, 327, 455, 260, 268, 328, 456, 146, 300, 329, 457, 274, 306, 330, 458, 402, 434, 331, 459, 388, 396, 332, 460, 147, 428, 333, 461, 275, 307, 334, 462, 403, 435, 335, 463, 5, 13, 336, 464, 148, 139, 337, 465, 276, 308, 338, 466, 404, 436, 339, 467, 133, 141, 340, 468, 149, 173, 341, 469, 277, 309, 342, 470, 405, 437, 343, 471, 261, 269, 344, 472, 150, 301, 345, 473, 278, 310, 346, 474, 406, 438, 347, 475, 389, 397, 348, 476, 151, 429, 349, 477, 279, 311, 350, 478, 407, 439, 351, 479, 6, 14, 261, 480, 152, 267, 269, 481, 280, 312, 344, 482, 408, 440, 472, 483, 134, 142, 150, 484, 153, 174, 301, 485, 281, 313, 345, 486, 409, 441, 473, 487, 262, 270, 278, 488, 154, 302, 310, 489, 282, 314, 346, 490, 410, 442, 474, 491, 390, 398, 406, 492, 155, 430, 438, 493, 283, 315, 347, 494, 411, 443, 475, 495, 7, 15, 389, 496, 156, 395, 397, 497, 284, 316, 348, 498, 412, 444, 476, 499, 135, 143, 151, 500, 157, 175, 429, 501, 285, 317, 349, 502, 413, 445, 477, 503, 263, 271, 279, 504, 158, 303, 311, 505, 286, 318, 350, 506, 414, 446, 478, 507, 391, 399, 407, 508, 159, 431, 439, 509, 287, 319, 351, 510, 415, 447, 479, 511}; + for (int i = 0; i < 512; ++i) { + if (ptr[i] != expected[i]) { + std::cout <<" failed\n"; + std::ostream_iterator Iter(std::cout, ", "); + std::copy(ptr, ptr + 512, Iter); + std::cout << std::endl; + return false; + } + } + + std::cout <<" pass\n"; + return true; + } } bool test_load_subgroup_striped_standalone() { @@ -85,13 +105,8 @@ bool test_load_subgroup_striped_standalone() { sycl::host_accessor data_accessor(buffer, sycl::read_only); const int *ptr = data_accessor.get_multi_ptr(); - int expected[512]; - for (int i = 0; i < 128; i++) { - expected[4 * i + 0] = i; - expected[4 * i + 1] = 4 * i + 1; - expected[4 * i + 2] = 4 * i + 2; - expected[4 * i + 3] = 4 * i + 3; - } + int expected[512] = {0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51, 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55, 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59, 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63, 64, 80, 96, 112, 65, 81, 97, 113, 66, 82, 98, 114, 67, 83, 99, 115, 68, 84, 100, 116, 69, 85, 101, 117, 70, 86, 102, 118, 71, 87, 103, 119, 72, 88, 104, 120, 73, 89, 105, 121, 74, 90, 106, 122, 75, 91, 107, 123, 76, 92, 108, 124, 77, 93, 109, 125, 78, 94, 110, 126, 79, 95, 111, 127, 128, 144, 160, 176, 129, 145, 161, 177, 130, 146, 162, 178, 131, 147, 163, 179, 132, 148, 164, 180, 133, 149, 165, 181, 134, 150, 166, 182, 135, 151, 167, 183, 136, 152, 168, 184, 137, 153, 169, 185, 138, 154, 170, 186, 139, 155, 171, 187, 140, 156, 172, 188, 141, 157, 173, 189, 142, 158, 174, 190, 143, 159, 175, 191, 192, 208, 224, 240, 193, 209, 225, 241, 194, 210, 226, 242, 195, 211, 227, 243, 196, 212, 228, 244, 197, 213, 229, 245, 198, 214, 230, 246, 199, 215, 231, 247, 200, 216, 232, 248, 201, 217, 233, 249, 202, 218, 234, 250, 203, 219, 235, 251, 204, 220, 236, 252, 205, 221, 237, 253, 206, 222, 238, 254, 207, 223, 239, 255, 256, 272, 288, 304, 257, 273, 289, 305, 258, 274, 290, 306, 259, 275, 291, 307, 260, 276, 292, 308, 261, 277, 293, 309, 262, 278, 294, 310, 263, 279, 295, 311, 264, 280, 296, 312, 265, 281, 297, 313, 266, 282, 298, 314, 267, 283, 299, 315, 268, 284, 300, 316, 269, 285, 301, 317, 270, 286, 302, 318, 271, 287, 303, 319, 320, 336, 352, 368, 321, 337, 353, 369, 322, 338, 354, 370, 323, 339, 355, 371, 324, 340, 356, 372, 325, 341, 357, 373, 326, 342, 358, 374, 327, 343, 359, 375, 328, 344, 360, 376, 329, 345, 361, 377, 330, 346, 362, 378, 331, 347, 363, 379, 332, 348, 364, 380, 333, 349, 365, 381, 334, 350, 366, 382, 335, 351, 367, 383, 384, 400, 416, 432, 385, 401, 417, 433, 386, 402, 418, 434, 387, 403, 419, 435, 388, 404, 420, 436, 389, 405, 421, 437, 390, 406, 422, 438, 391, 407, 423, 439, 392, 408, 424, 440, 393, 409, 425, 441, 394, 410, 426, 442, 395, 411, 427, 443, 396, 412, 428, 444, 397, 413, 429, 445, 398, 414, 430, 446, 399, 415, 431, 447, 448, 464, 480, 496, 449, 465, 481, 497, 450, 466, 482, 498, 451, 467, 483, 499, 452, 468, 484, 500, 453, 469, 485, 501, 454, 470, 486, 502, 455, 471, 487, 503, 456, 472, 488, 504, 457, 473, 489, 505, 458, 474, 490, 506, 459, 475, 491, 507, 460, 476, 492, 508, 461, 477, 493, 509, 462, 478, 494, 510, 463, 479, 495, 511}; + for (int i = 0; i < 512; ++i) { if (ptr[i] != expected[i]) { std::cout <<" failed\n"; @@ -153,21 +168,8 @@ bool test_load_blocked_striped_standalone() { return true; } else{ - int expected[512]; - - for (int i = 4; i < 128; ++i) { - expected[4 * i + 0] = 4 * i + (128 * 0); - expected[4 * i + 1] = 4 * i + 1 + (128 * 1); - expected[4 * i + 2] = 4 * i + 2 + (128 * 2); - expected[4 * i + 3] = 4 * i + 3 + (128 * 3); - - } + int expected[512]={0, 128, 256, 384, 1, 129, 257, 385, 2, 130, 258, 386, 3, 131, 259, 387, 4, 132, 260, 388, 5, 133, 261, 389, 6, 134, 262, 390, 7, 135, 263, 391, 8, 136, 264, 392, 9, 137, 265, 393, 10, 138, 266, 394, 11, 139, 267, 395, 12, 140, 268, 396, 13, 141, 269, 397, 14, 142, 270, 398, 15, 143, 271, 399, 4, 144, 272, 400, 132, 145, 273, 401, 260, 146, 274, 402, 388, 147, 275, 403, 5, 148, 276, 404, 133, 149, 277, 405, 261, 150, 278, 406, 389, 151, 279, 407, 6, 152, 280, 408, 134, 153, 281, 409, 262, 154, 282, 410, 390, 155, 283, 411, 7, 156, 284, 412, 135, 157, 285, 413, 263, 158, 286, 414, 391, 159, 287, 415, 8, 160, 288, 416, 136, 161, 289, 417, 264, 162, 290, 418, 392, 163, 291, 419, 9, 164, 292, 420, 137, 165, 293, 421, 265, 166, 294, 422, 393, 167, 295, 423, 10, 168, 296, 424, 138, 169, 297, 425, 266, 170, 298, 426, 394, 171, 299, 427, 11, 172, 300, 428, 139, 173, 301, 429, 267, 174, 302, 430, 395, 175, 303, 431, 12, 11, 304, 432, 140, 172, 305, 433, 268, 300, 306, 434, 396, 428, 307, 435, 13, 139, 308, 436, 141, 173, 309, 437, 269, 301, 310, 438, 397, 429, 311, 439, 14, 267, 312, 440, 142, 174, 313, 441, 270, 302, 314, 442, 398, 430, 315, 443, 15, 395, 316, 444, 143, 175, 317, 445, 271, 303, 318, 446, 399, 431, 319, 447, 4, 12, 320, 448, 144, 11, 321, 449, 272, 304, 322, 450, 400, 432, 323, 451, 132, 140, 324, 452, 145, 172, 325, 453, 273, 305, 326, 454, 401, 433, 327, 455, 260, 268, 328, 456, 146, 300, 329, 457, 274, 306, 330, 458, 402, 434, 331, 459, 388, 396, 332, 460, 147, 428, 333, 461, 275, 307, 334, 462, 403, 435, 335, 463, 5, 13, 336, 464, 148, 139, 337, 465, 276, 308, 338, 466, 404, 436, 339, 467, 133, 141, 340, 468, 149, 173, 341, 469, 277, 309, 342, 470, 405, 437, 343, 471, 261, 269, 344, 472, 150, 301, 345, 473, 278, 310, 346, 474, 406, 438, 347, 475, 389, 397, 348, 476, 151, 429, 349, 477, 279, 311, 350, 478, 407, 439, 351, 479, 6, 14, 261, 480, 152, 267, 269, 481, 280, 312, 344, 482, 408, 440, 472, 483, 134, 142, 150, 484, 153, 174, 301, 485, 281, 313, 345, 486, 409, 441, 473, 487, 262, 270, 278, 488, 154, 302, 310, 489, 282, 314, 346, 490, 410, 442, 474, 491, 390, 398, 406, 492, 155, 430, 438, 493, 283, 315, 347, 494, 411, 443, 475, 495, 7, 15, 389, 496, 156, 395, 397, 497, 284, 316, 348, 498, 412, 444, 476, 499, 135, 143, 151, 500, 157, 175, 429, 501, 285, 317, 349, 502, 413, 445, 477, 503, 263, 271, 279, 504, 158, 303, 311, 505, 286, 318, 350, 506, 414, 446, 478, 507, 391, 399, 407, 508, 159, 431, 439, 509, 287, 319, 351, 510, 415, 447, 479, 511}; - for(int i=0;i<512;i++){std::cout<() && test_load_blocked_striped() && - test_load_subgroup_striped_standalone() && + return !(test_load_blocked_striped() && test_load_blocked_striped() && test_load_subgroup_striped_standalone() && test_load_blocked_striped_standalone() && test_load_blocked_striped_standalone()); } From c5447c0f2076912f6e7b33389ceffb3dfd97c178 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Tue, 30 Apr 2024 17:39:46 +0530 Subject: [PATCH 15/36] revert accidental deletion --- help_function/help_function.xml | 1 + 1 file changed, 1 insertion(+) diff --git a/help_function/help_function.xml b/help_function/help_function.xml index 2e9f9f2f8..23e2fe5ae 100644 --- a/help_function/help_function.xml +++ b/help_function/help_function.xml @@ -150,6 +150,7 @@ + From d99a8288bd090f04d90102233d062364a339c325 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Wed, 1 May 2024 23:56:04 +0530 Subject: [PATCH 16/36] replace striped expected output --- help_function/src/onedpl_test_group_load.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/help_function/src/onedpl_test_group_load.cpp b/help_function/src/onedpl_test_group_load.cpp index 267b971dc..3dae2e254 100644 --- a/help_function/src/onedpl_test_group_load.cpp +++ b/help_function/src/onedpl_test_group_load.cpp @@ -63,7 +63,7 @@ bool test_load_blocked_striped() { } else{ - int expected[512]={0, 128, 256, 384, 1, 129, 257, 385, 2, 130, 258, 386, 3, 131, 259, 387, 4, 132, 260, 388, 5, 133, 261, 389, 6, 134, 262, 390, 7, 135, 263, 391, 8, 136, 264, 392, 9, 137, 265, 393, 10, 138, 266, 394, 11, 139, 267, 395, 12, 140, 268, 396, 13, 141, 269, 397, 14, 142, 270, 398, 15, 143, 271, 399, 4, 144, 272, 400, 132, 145, 273, 401, 260, 146, 274, 402, 388, 147, 275, 403, 5, 148, 276, 404, 133, 149, 277, 405, 261, 150, 278, 406, 389, 151, 279, 407, 6, 152, 280, 408, 134, 153, 281, 409, 262, 154, 282, 410, 390, 155, 283, 411, 7, 156, 284, 412, 135, 157, 285, 413, 263, 158, 286, 414, 391, 159, 287, 415, 8, 160, 288, 416, 136, 161, 289, 417, 264, 162, 290, 418, 392, 163, 291, 419, 9, 164, 292, 420, 137, 165, 293, 421, 265, 166, 294, 422, 393, 167, 295, 423, 10, 168, 296, 424, 138, 169, 297, 425, 266, 170, 298, 426, 394, 171, 299, 427, 11, 172, 300, 428, 139, 173, 301, 429, 267, 174, 302, 430, 395, 175, 303, 431, 12, 11, 304, 432, 140, 172, 305, 433, 268, 300, 306, 434, 396, 428, 307, 435, 13, 139, 308, 436, 141, 173, 309, 437, 269, 301, 310, 438, 397, 429, 311, 439, 14, 267, 312, 440, 142, 174, 313, 441, 270, 302, 314, 442, 398, 430, 315, 443, 15, 395, 316, 444, 143, 175, 317, 445, 271, 303, 318, 446, 399, 431, 319, 447, 4, 12, 320, 448, 144, 11, 321, 449, 272, 304, 322, 450, 400, 432, 323, 451, 132, 140, 324, 452, 145, 172, 325, 453, 273, 305, 326, 454, 401, 433, 327, 455, 260, 268, 328, 456, 146, 300, 329, 457, 274, 306, 330, 458, 402, 434, 331, 459, 388, 396, 332, 460, 147, 428, 333, 461, 275, 307, 334, 462, 403, 435, 335, 463, 5, 13, 336, 464, 148, 139, 337, 465, 276, 308, 338, 466, 404, 436, 339, 467, 133, 141, 340, 468, 149, 173, 341, 469, 277, 309, 342, 470, 405, 437, 343, 471, 261, 269, 344, 472, 150, 301, 345, 473, 278, 310, 346, 474, 406, 438, 347, 475, 389, 397, 348, 476, 151, 429, 349, 477, 279, 311, 350, 478, 407, 439, 351, 479, 6, 14, 261, 480, 152, 267, 269, 481, 280, 312, 344, 482, 408, 440, 472, 483, 134, 142, 150, 484, 153, 174, 301, 485, 281, 313, 345, 486, 409, 441, 473, 487, 262, 270, 278, 488, 154, 302, 310, 489, 282, 314, 346, 490, 410, 442, 474, 491, 390, 398, 406, 492, 155, 430, 438, 493, 283, 315, 347, 494, 411, 443, 475, 495, 7, 15, 389, 496, 156, 395, 397, 497, 284, 316, 348, 498, 412, 444, 476, 499, 135, 143, 151, 500, 157, 175, 429, 501, 285, 317, 349, 502, 413, 445, 477, 503, 263, 271, 279, 504, 158, 303, 311, 505, 286, 318, 350, 506, 414, 446, 478, 507, 391, 399, 407, 508, 159, 431, 439, 509, 287, 319, 351, 510, 415, 447, 479, 511}; + int expected[512]={0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51, 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55, 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59, 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63, 64, 80, 96, 112, 65, 81, 97, 113, 66, 82, 98, 114, 67, 83, 99, 115, 68, 84, 100, 116, 69, 85, 101, 117, 70, 86, 102, 118, 71, 87, 103, 119, 72, 88, 104, 120, 73, 89, 105, 121, 74, 90, 106, 122, 75, 91, 107, 123, 76, 92, 108, 124, 77, 93, 109, 125, 78, 94, 110, 126, 79, 95, 111, 127, 128, 144, 160, 176, 129, 145, 161, 177, 130, 146, 162, 178, 131, 147, 163, 179, 132, 148, 164, 180, 133, 149, 165, 181, 134, 150, 166, 182, 135, 151, 167, 183, 136, 152, 168, 184, 137, 153, 169, 185, 138, 154, 170, 186, 139, 155, 171, 187, 140, 156, 172, 188, 141, 157, 173, 189, 142, 158, 174, 190, 143, 159, 175, 191, 192, 208, 224, 240, 193, 209, 225, 241, 194, 210, 226, 242, 195, 211, 227, 243, 196, 212, 228, 244, 197, 213, 229, 245, 198, 214, 230, 246, 199, 215, 231, 247, 200, 216, 232, 248, 201, 217, 233, 249, 202, 218, 234, 250, 203, 219, 235, 251, 204, 220, 236, 252, 205, 221, 237, 253, 206, 222, 238, 254, 207, 223, 239, 255, 256, 272, 288, 304, 257, 273, 289, 305, 258, 274, 290, 306, 259, 275, 291, 307, 260, 276, 292, 308, 261, 277, 293, 309, 262, 278, 294, 310, 263, 279, 295, 311, 264, 280, 296, 312, 265, 281, 297, 313, 266, 282, 298, 314, 267, 283, 299, 315, 268, 284, 300, 316, 269, 285, 301, 317, 270, 286, 302, 318, 271, 287, 303, 319, 320, 336, 352, 368, 321, 337, 353, 369, 322, 338, 354, 370, 323, 339, 355, 371, 324, 340, 356, 372, 325, 341, 357, 373, 326, 342, 358, 374, 327, 343, 359, 375, 328, 344, 360, 376, 329, 345, 361, 377, 330, 346, 362, 378, 331, 347, 363, 379, 332, 348, 364, 380, 333, 349, 365, 381, 334, 350, 366, 382, 335, 351, 367, 383, 384, 400, 416, 432, 385, 401, 417, 433, 386, 402, 418, 434, 387, 403, 419, 435, 388, 404, 420, 436, 389, 405, 421, 437, 390, 406, 422, 438, 391, 407, 423, 439, 392, 408, 424, 440, 393, 409, 425, 441, 394, 410, 426, 442, 395, 411, 427, 443, 396, 412, 428, 444, 397, 413, 429, 445, 398, 414, 430, 446, 399, 415, 431, 447, 448, 464, 480, 496, 449, 465, 481, 497, 450, 466, 482, 498, 451, 467, 483, 499, 452, 468, 484, 500, 453, 469, 485, 501, 454, 470, 486, 502, 455, 471, 487, 503, 456, 472, 488, 504, 457, 473, 489, 505, 458, 474, 490, 506, 459, 475, 491, 507, 460, 476, 492, 508, 461, 477, 493, 509, 462, 478, 494, 510, 463, 479, 495, 511}; for (int i = 0; i < 512; ++i) { if (ptr[i] != expected[i]) { std::cout <<" failed\n"; From 5ddb77968d2c1c9540329b6f079e0fa41f03a7cc Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Thu, 2 May 2024 15:42:55 +0530 Subject: [PATCH 17/36] update striped test case --- help_function/src/onedpl_test_group_load.cpp | 35 +++++++++++++------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/help_function/src/onedpl_test_group_load.cpp b/help_function/src/onedpl_test_group_load.cpp index 3dae2e254..c31034b92 100644 --- a/help_function/src/onedpl_test_group_load.cpp +++ b/help_function/src/onedpl_test_group_load.cpp @@ -35,9 +35,10 @@ bool test_load_blocked_striped() { auto *tmp = tacc.get_multi_ptr().get(); group_load(tmp).load(item, d, thread_data); // Write thread_data of each work item at index to the global buffer - int global_index = item.get_global_linear_id() * 4; // Each thread_data has 4 elements + int global_index = item.get_group(2)*item.get_local_range().get(2) + item.get_local_id(2); // Each thread_data has 4 elements + #pragma unroll for (int i = 0; i < 4; ++i) { - data_accessor[global_index + i] = thread_data[i]; + data_accessor[global_index * 4 + i] = thread_data[i]; } }); }); @@ -62,8 +63,12 @@ bool test_load_blocked_striped() { return true; } else{ - - int expected[512]={0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51, 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55, 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59, 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63, 64, 80, 96, 112, 65, 81, 97, 113, 66, 82, 98, 114, 67, 83, 99, 115, 68, 84, 100, 116, 69, 85, 101, 117, 70, 86, 102, 118, 71, 87, 103, 119, 72, 88, 104, 120, 73, 89, 105, 121, 74, 90, 106, 122, 75, 91, 107, 123, 76, 92, 108, 124, 77, 93, 109, 125, 78, 94, 110, 126, 79, 95, 111, 127, 128, 144, 160, 176, 129, 145, 161, 177, 130, 146, 162, 178, 131, 147, 163, 179, 132, 148, 164, 180, 133, 149, 165, 181, 134, 150, 166, 182, 135, 151, 167, 183, 136, 152, 168, 184, 137, 153, 169, 185, 138, 154, 170, 186, 139, 155, 171, 187, 140, 156, 172, 188, 141, 157, 173, 189, 142, 158, 174, 190, 143, 159, 175, 191, 192, 208, 224, 240, 193, 209, 225, 241, 194, 210, 226, 242, 195, 211, 227, 243, 196, 212, 228, 244, 197, 213, 229, 245, 198, 214, 230, 246, 199, 215, 231, 247, 200, 216, 232, 248, 201, 217, 233, 249, 202, 218, 234, 250, 203, 219, 235, 251, 204, 220, 236, 252, 205, 221, 237, 253, 206, 222, 238, 254, 207, 223, 239, 255, 256, 272, 288, 304, 257, 273, 289, 305, 258, 274, 290, 306, 259, 275, 291, 307, 260, 276, 292, 308, 261, 277, 293, 309, 262, 278, 294, 310, 263, 279, 295, 311, 264, 280, 296, 312, 265, 281, 297, 313, 266, 282, 298, 314, 267, 283, 299, 315, 268, 284, 300, 316, 269, 285, 301, 317, 270, 286, 302, 318, 271, 287, 303, 319, 320, 336, 352, 368, 321, 337, 353, 369, 322, 338, 354, 370, 323, 339, 355, 371, 324, 340, 356, 372, 325, 341, 357, 373, 326, 342, 358, 374, 327, 343, 359, 375, 328, 344, 360, 376, 329, 345, 361, 377, 330, 346, 362, 378, 331, 347, 363, 379, 332, 348, 364, 380, 333, 349, 365, 381, 334, 350, 366, 382, 335, 351, 367, 383, 384, 400, 416, 432, 385, 401, 417, 433, 386, 402, 418, 434, 387, 403, 419, 435, 388, 404, 420, 436, 389, 405, 421, 437, 390, 406, 422, 438, 391, 407, 423, 439, 392, 408, 424, 440, 393, 409, 425, 441, 394, 410, 426, 442, 395, 411, 427, 443, 396, 412, 428, 444, 397, 413, 429, 445, 398, 414, 430, 446, 399, 415, 431, 447, 448, 464, 480, 496, 449, 465, 481, 497, 450, 466, 482, 498, 451, 467, 483, 499, 452, 468, 484, 500, 453, 469, 485, 501, 454, 470, 486, 502, 455, 471, 487, 503, 456, 472, 488, 504, 457, 473, 489, 505, 458, 474, 490, 506, 459, 475, 491, 507, 460, 476, 492, 508, 461, 477, 493, 509, 462, 478, 494, 510, 463, 479, 495, 511}; + int expected[512]; + for (int i = 0;i < 128; ++i){ + for(int j=0;j < 4; ++j){ + expected[i * 4 +j] = j * 128 +i; + } + } for (int i = 0; i < 512; ++i) { if (ptr[i] != expected[i]) { std::cout <<" failed\n"; @@ -95,9 +100,10 @@ bool test_load_subgroup_striped_standalone() { auto *d = dacc.get_multi_ptr().get(); dpct::group::uninitialized_load_subgroup_striped<4, int>(item, d, thread_data); // Write thread_data of each work item at index to the global buffer - int global_index = item.get_global_linear_id() * 4; // Each thread_data has 4 elements + int global_index = item.get_group(2)*item.get_local_range().get(2) + item.get_local_id(2); // Each thread_data has 4 elements + #pragma unroll for (int i = 0; i < 4; ++i) { - dacc[global_index + i] = thread_data[i]; + dacc[global_index * 4 + i] = thread_data[i]; } }); }); @@ -138,13 +144,14 @@ bool test_load_blocked_striped_standalone() { int thread_data[4]; auto *d = dacc.get_multi_ptr().get(); if( T == dpct::group::load_algorithm::BLOCK_LOAD_DIRECT) - {dpct::group::load_blocked<4, T, int>(item, d, thread_data);} + {dpct::group::load_blocked<4, int>(item, d, thread_data);} else - {dpct::group::load_striped<4, T, int>(item, d, thread_data);} + {dpct::group::load_striped<4, int>(item, d, thread_data);} // Write thread_data of each work item at index to the global buffer - int global_index = item.get_global_linear_id() * 4; // Each thread_data has 4 elements + int global_index = int global_index = item.get_group(2)*item.get_local_range().get(2) + item.get_local_id(2); // Each thread_data has 4 elements + #pragma unroll for (int i = 0; i < 4; ++i) { - dacc[global_index + i] = thread_data[i]; + dacc[global_index * 4 + i] = thread_data[i]; } }); }); @@ -168,8 +175,12 @@ bool test_load_blocked_striped_standalone() { return true; } else{ - int expected[512]={0, 128, 256, 384, 1, 129, 257, 385, 2, 130, 258, 386, 3, 131, 259, 387, 4, 132, 260, 388, 5, 133, 261, 389, 6, 134, 262, 390, 7, 135, 263, 391, 8, 136, 264, 392, 9, 137, 265, 393, 10, 138, 266, 394, 11, 139, 267, 395, 12, 140, 268, 396, 13, 141, 269, 397, 14, 142, 270, 398, 15, 143, 271, 399, 4, 144, 272, 400, 132, 145, 273, 401, 260, 146, 274, 402, 388, 147, 275, 403, 5, 148, 276, 404, 133, 149, 277, 405, 261, 150, 278, 406, 389, 151, 279, 407, 6, 152, 280, 408, 134, 153, 281, 409, 262, 154, 282, 410, 390, 155, 283, 411, 7, 156, 284, 412, 135, 157, 285, 413, 263, 158, 286, 414, 391, 159, 287, 415, 8, 160, 288, 416, 136, 161, 289, 417, 264, 162, 290, 418, 392, 163, 291, 419, 9, 164, 292, 420, 137, 165, 293, 421, 265, 166, 294, 422, 393, 167, 295, 423, 10, 168, 296, 424, 138, 169, 297, 425, 266, 170, 298, 426, 394, 171, 299, 427, 11, 172, 300, 428, 139, 173, 301, 429, 267, 174, 302, 430, 395, 175, 303, 431, 12, 11, 304, 432, 140, 172, 305, 433, 268, 300, 306, 434, 396, 428, 307, 435, 13, 139, 308, 436, 141, 173, 309, 437, 269, 301, 310, 438, 397, 429, 311, 439, 14, 267, 312, 440, 142, 174, 313, 441, 270, 302, 314, 442, 398, 430, 315, 443, 15, 395, 316, 444, 143, 175, 317, 445, 271, 303, 318, 446, 399, 431, 319, 447, 4, 12, 320, 448, 144, 11, 321, 449, 272, 304, 322, 450, 400, 432, 323, 451, 132, 140, 324, 452, 145, 172, 325, 453, 273, 305, 326, 454, 401, 433, 327, 455, 260, 268, 328, 456, 146, 300, 329, 457, 274, 306, 330, 458, 402, 434, 331, 459, 388, 396, 332, 460, 147, 428, 333, 461, 275, 307, 334, 462, 403, 435, 335, 463, 5, 13, 336, 464, 148, 139, 337, 465, 276, 308, 338, 466, 404, 436, 339, 467, 133, 141, 340, 468, 149, 173, 341, 469, 277, 309, 342, 470, 405, 437, 343, 471, 261, 269, 344, 472, 150, 301, 345, 473, 278, 310, 346, 474, 406, 438, 347, 475, 389, 397, 348, 476, 151, 429, 349, 477, 279, 311, 350, 478, 407, 439, 351, 479, 6, 14, 261, 480, 152, 267, 269, 481, 280, 312, 344, 482, 408, 440, 472, 483, 134, 142, 150, 484, 153, 174, 301, 485, 281, 313, 345, 486, 409, 441, 473, 487, 262, 270, 278, 488, 154, 302, 310, 489, 282, 314, 346, 490, 410, 442, 474, 491, 390, 398, 406, 492, 155, 430, 438, 493, 283, 315, 347, 494, 411, 443, 475, 495, 7, 15, 389, 496, 156, 395, 397, 497, 284, 316, 348, 498, 412, 444, 476, 499, 135, 143, 151, 500, 157, 175, 429, 501, 285, 317, 349, 502, 413, 445, 477, 503, 263, 271, 279, 504, 158, 303, 311, 505, 286, 318, 350, 506, 414, 446, 478, 507, 391, 399, 407, 508, 159, 431, 439, 509, 287, 319, 351, 510, 415, 447, 479, 511}; - + int expected[512]; + for (int i = 0;i < 128; ++i){ + for(int j=0;j < 4; ++j){ + expected[i * 4 +j] = j * 128 +i; + } + } for (int i = 0; i < 512; ++i) { if (ptr[i] != expected[i]) { std::cout <<" failed\n"; From ab0277a9ae014077b0d7011397a47040b899be15 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Thu, 2 May 2024 22:34:12 +0530 Subject: [PATCH 18/36] use subgroup size --- help_function/src/onedpl_test_group_load.cpp | 22 ++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/help_function/src/onedpl_test_group_load.cpp b/help_function/src/onedpl_test_group_load.cpp index c31034b92..3c5357536 100644 --- a/help_function/src/onedpl_test_group_load.cpp +++ b/help_function/src/onedpl_test_group_load.cpp @@ -91,6 +91,7 @@ bool test_load_subgroup_striped_standalone() { for (int i = 0; i < 512; i++) data[i] = i; sycl::buffer buffer(data, 512); + sycl::buffer sg_sz_buf(1); q.submit([&](sycl::handler &h) { sycl::accessor dacc(buffer, h, sycl::read_write); h.parallel_for( @@ -98,6 +99,11 @@ bool test_load_subgroup_striped_standalone() { [=](sycl::nd_item<3> item) { int thread_data[4]; auto *d = dacc.get_multi_ptr().get(); + auto sg_sz_acc = sg_sz_buf.get_access.(h); + size_t gid = item.get_global_linear_id(); + if (gid == 0) { + sg_sz_acc[0] = item.get_sub_group().get_local_linear_range(); + } dpct::group::uninitialized_load_subgroup_striped<4, int>(item, d, thread_data); // Write thread_data of each work item at index to the global buffer int global_index = item.get_group(2)*item.get_local_range().get(2) + item.get_local_id(2); // Each thread_data has 4 elements @@ -111,8 +117,20 @@ bool test_load_subgroup_striped_standalone() { sycl::host_accessor data_accessor(buffer, sycl::read_only); const int *ptr = data_accessor.get_multi_ptr(); - int expected[512] = {0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51, 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55, 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59, 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63, 64, 80, 96, 112, 65, 81, 97, 113, 66, 82, 98, 114, 67, 83, 99, 115, 68, 84, 100, 116, 69, 85, 101, 117, 70, 86, 102, 118, 71, 87, 103, 119, 72, 88, 104, 120, 73, 89, 105, 121, 74, 90, 106, 122, 75, 91, 107, 123, 76, 92, 108, 124, 77, 93, 109, 125, 78, 94, 110, 126, 79, 95, 111, 127, 128, 144, 160, 176, 129, 145, 161, 177, 130, 146, 162, 178, 131, 147, 163, 179, 132, 148, 164, 180, 133, 149, 165, 181, 134, 150, 166, 182, 135, 151, 167, 183, 136, 152, 168, 184, 137, 153, 169, 185, 138, 154, 170, 186, 139, 155, 171, 187, 140, 156, 172, 188, 141, 157, 173, 189, 142, 158, 174, 190, 143, 159, 175, 191, 192, 208, 224, 240, 193, 209, 225, 241, 194, 210, 226, 242, 195, 211, 227, 243, 196, 212, 228, 244, 197, 213, 229, 245, 198, 214, 230, 246, 199, 215, 231, 247, 200, 216, 232, 248, 201, 217, 233, 249, 202, 218, 234, 250, 203, 219, 235, 251, 204, 220, 236, 252, 205, 221, 237, 253, 206, 222, 238, 254, 207, 223, 239, 255, 256, 272, 288, 304, 257, 273, 289, 305, 258, 274, 290, 306, 259, 275, 291, 307, 260, 276, 292, 308, 261, 277, 293, 309, 262, 278, 294, 310, 263, 279, 295, 311, 264, 280, 296, 312, 265, 281, 297, 313, 266, 282, 298, 314, 267, 283, 299, 315, 268, 284, 300, 316, 269, 285, 301, 317, 270, 286, 302, 318, 271, 287, 303, 319, 320, 336, 352, 368, 321, 337, 353, 369, 322, 338, 354, 370, 323, 339, 355, 371, 324, 340, 356, 372, 325, 341, 357, 373, 326, 342, 358, 374, 327, 343, 359, 375, 328, 344, 360, 376, 329, 345, 361, 377, 330, 346, 362, 378, 331, 347, 363, 379, 332, 348, 364, 380, 333, 349, 365, 381, 334, 350, 366, 382, 335, 351, 367, 383, 384, 400, 416, 432, 385, 401, 417, 433, 386, 402, 418, 434, 387, 403, 419, 435, 388, 404, 420, 436, 389, 405, 421, 437, 390, 406, 422, 438, 391, 407, 423, 439, 392, 408, 424, 440, 393, 409, 425, 441, 394, 410, 426, 442, 395, 411, 427, 443, 396, 412, 428, 444, 397, 413, 429, 445, 398, 414, 430, 446, 399, 415, 431, 447, 448, 464, 480, 496, 449, 465, 481, 497, 450, 466, 482, 498, 451, 467, 483, 499, 452, 468, 484, 500, 453, 469, 485, 501, 454, 470, 486, 502, 455, 471, 487, 503, 456, 472, 488, 504, 457, 473, 489, 505, 458, 474, 490, 506, 459, 475, 491, 507, 460, 476, 492, 508, 461, 477, 493, 509, 462, 478, 494, 510, 463, 479, 495, 511}; - + auto sg_sz = sg_sz_acc.get_host_access()[0]; + int expected[512]; + int idx=0; + for(int i=0;i< 8 ;++i){ + for(int j =0;j < sg_sz;++j){ + for(int k =0;k < 4;++k){ + int val = i * sg_sz * 4 + j * 4 + k; + if (idx < 512){ + expected[idx] = val; + idx++; + } + } + } + } for (int i = 0; i < 512; ++i) { if (ptr[i] != expected[i]) { std::cout <<" failed\n"; From 300ac08135307cc80414410e022ed522fac86e50 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Sun, 5 May 2024 23:38:02 +0530 Subject: [PATCH 19/36] re-add to prev line --- help_function/help_function.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/help_function/help_function.xml b/help_function/help_function.xml index 23e2fe5ae..82429ffb1 100644 --- a/help_function/help_function.xml +++ b/help_function/help_function.xml @@ -149,9 +149,9 @@ + - From 9c21a36085fd42c009469f74fa3bb4ff6bd53a93 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Mon, 6 May 2024 08:26:58 +0530 Subject: [PATCH 20/36] review commits --- help_function/src/onedpl_test_group_load.cpp | 40 ++++++++++---------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/help_function/src/onedpl_test_group_load.cpp b/help_function/src/onedpl_test_group_load.cpp index 3c5357536..6118b210f 100644 --- a/help_function/src/onedpl_test_group_load.cpp +++ b/help_function/src/onedpl_test_group_load.cpp @@ -14,7 +14,7 @@ #include template -bool test_load_blocked_striped() { +bool test_group_load() { // Tests dpct::group::load_algorithm::BLOCK_LOAD_DIRECT & dpct::group::load_algorithm::BLOCK_LOAD_STRIPED // in its entirety as API functions sycl::queue q; @@ -64,9 +64,11 @@ bool test_load_blocked_striped() { } else{ int expected[512]; - for (int i = 0;i < 128; ++i){ - for(int j=0;j < 4; ++j){ - expected[i * 4 +j] = j * 128 +i; + int num_threads = 128; + int items_per_thread = 4; + for (int i = 0;i < num_threads; ++i){ + for(int j=0;j < items_per_thread; ++j){ + expected[i * items_per_thread +j] = j * num_threads +i; } } for (int i = 0; i < 512; ++i) { @@ -119,18 +121,13 @@ bool test_load_subgroup_striped_standalone() { const int *ptr = data_accessor.get_multi_ptr(); auto sg_sz = sg_sz_acc.get_host_access()[0]; int expected[512]; - int idx=0; - for(int i=0;i< 8 ;++i){ - for(int j =0;j < sg_sz;++j){ - for(int k =0;k < 4;++k){ - int val = i * sg_sz * 4 + j * 4 + k; - if (idx < 512){ - expected[idx] = val; - idx++; - } - } + int num_threads = 128; + int items_per_thread = 4; + for (int i = 0; i < num_threads; ++i) { + for (int j = 0; j < items_per_thread; ++j) { + expected[items_per_thread * i + j] = (i / sg_sz) * sg_sz * items_per_thread + sg_sz * j + i % sg_sz; + } } - } for (int i = 0; i < 512; ++i) { if (ptr[i] != expected[i]) { std::cout <<" failed\n"; @@ -193,12 +190,13 @@ bool test_load_blocked_striped_standalone() { return true; } else{ - int expected[512]; - for (int i = 0;i < 128; ++i){ - for(int j=0;j < 4; ++j){ - expected[i * 4 +j] = j * 128 +i; + int num_threads = 128; + int items_per_thread = 4; + for (int i = 0;i < num_threads; ++i){ + for(int j=0;j < items_per_thread; ++j){ + expected[i * items_per_thread +j] = j * num_threads +i; + } } - } for (int i = 0; i < 512; ++i) { if (ptr[i] != expected[i]) { std::cout <<" failed\n"; @@ -218,6 +216,6 @@ bool test_load_blocked_striped_standalone() { int main() { - return !(test_load_blocked_striped() && test_load_blocked_striped() && test_load_subgroup_striped_standalone() && + return !(test_group_load() && test_group_load() && test_load_subgroup_striped_standalone() && test_load_blocked_striped_standalone() && test_load_blocked_striped_standalone()); } From 7cd159cf900dbaa99a6b3ce0375727f6a815b53f Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Mon, 6 May 2024 08:53:28 +0530 Subject: [PATCH 21/36] refactor tests --- help_function/src/onedpl_test_group_load.cpp | 168 ++++++++----------- 1 file changed, 71 insertions(+), 97 deletions(-) diff --git a/help_function/src/onedpl_test_group_load.cpp b/help_function/src/onedpl_test_group_load.cpp index 6118b210f..82ba2992a 100644 --- a/help_function/src/onedpl_test_group_load.cpp +++ b/help_function/src/onedpl_test_group_load.cpp @@ -13,6 +13,72 @@ #include #include + +template +bool helper_validation_function(const int* ptr, const char * func_name){ + if ( T == dpct::group::load_algorithm::BLOCK_LOAD_DIRECT) + { + for (int i = 0; i < 512; ++i) { + if (ptr[i] != i) { + std::cout << func_name << "_blocked" <<" failed\n"; + std::ostream_iterator Iter(std::cout, ", "); + std::copy(ptr, ptr + 512, Iter); + std::cout << std::endl; + return false; + } + } + std::cout << func_name << "_blocked" <<" pass\n"; + return true; + } + + else{ + int expected[512]; + int num_threads = 128; + int items_per_thread = 4; + for (int i = 0;i < num_threads; ++i){ + for(int j=0;j < items_per_thread; ++j){ + expected[i * items_per_thread +j] = j * num_threads +i; + } + } + for (int i = 0; i < 512; ++i) { + if (ptr[i] != expected[i]) { + std::cout << func_name << "_striped" <<" failed\n"; + std::ostream_iterator Iter(std::cout, ", "); + std::copy(ptr, ptr + 512, Iter); + std::cout << std::endl; + return false; + } + } + + std::cout << func_name << "_striped" <<" pass\n"; + return true; + } + return false; +} + +bool subgroup_helper_validation_function(const int* ptr,const int &sg_sz, const char* func_name){ + int expected[512]; + int num_threads = 128; + int items_per_thread = 4; + for (int i = 0; i < num_threads; ++i) { + for (int j = 0; j < items_per_thread; ++j) { + expected[items_per_thread * i + j] = (i / sg_sz) * sg_sz * items_per_thread + sg_sz * j + i % sg_sz; + } + } + for (int i = 0; i < 512; ++i) { + if (ptr[i] != expected[i]) { + std::cout <<" failed\n"; + std::ostream_iterator Iter(std::cout, ", "); + std::copy(ptr, ptr + 512, Iter); + std::cout << std::endl; + return false; + } + } + + std::cout <<" pass\n"; + return true; +} + template bool test_group_load() { // Tests dpct::group::load_algorithm::BLOCK_LOAD_DIRECT & dpct::group::load_algorithm::BLOCK_LOAD_STRIPED @@ -46,44 +112,7 @@ bool test_group_load() { sycl::host_accessor data_accessor(buffer, sycl::read_only); const int *ptr = data_accessor.get_multi_ptr(); - - if ( T == dpct::group::load_algorithm::BLOCK_LOAD_DIRECT) - { - for (int i = 0; i < 512; ++i) { - if (ptr[i] != i) { - std::cout <<" failed\n"; - std::ostream_iterator Iter(std::cout, ", "); - std::copy(ptr, ptr + 512, Iter); - std::cout << std::endl; - return false; - } - } - - std::cout <<" pass\n"; - return true; - } - else{ - int expected[512]; - int num_threads = 128; - int items_per_thread = 4; - for (int i = 0;i < num_threads; ++i){ - for(int j=0;j < items_per_thread; ++j){ - expected[i * items_per_thread +j] = j * num_threads +i; - } - } - for (int i = 0; i < 512; ++i) { - if (ptr[i] != expected[i]) { - std::cout <<" failed\n"; - std::ostream_iterator Iter(std::cout, ", "); - std::copy(ptr, ptr + 512, Iter); - std::cout << std::endl; - return false; - } - } - - std::cout <<" pass\n"; - return true; - } + return helper_validation_function(ptr, "test_group_load"); } bool test_load_subgroup_striped_standalone() { @@ -120,30 +149,11 @@ bool test_load_subgroup_striped_standalone() { sycl::host_accessor data_accessor(buffer, sycl::read_only); const int *ptr = data_accessor.get_multi_ptr(); auto sg_sz = sg_sz_acc.get_host_access()[0]; - int expected[512]; - int num_threads = 128; - int items_per_thread = 4; - for (int i = 0; i < num_threads; ++i) { - for (int j = 0; j < items_per_thread; ++j) { - expected[items_per_thread * i + j] = (i / sg_sz) * sg_sz * items_per_thread + sg_sz * j + i % sg_sz; - } - } - for (int i = 0; i < 512; ++i) { - if (ptr[i] != expected[i]) { - std::cout <<" failed\n"; - std::ostream_iterator Iter(std::cout, ", "); - std::copy(ptr, ptr + 512, Iter); - std::cout << std::endl; - return false; - } - } - - std::cout <<" pass\n"; - return true; + return subgroup_helper_validation_function(ptr, sg_sz, "test_subgroup_striped_standalone"); } template -bool test_load_blocked_striped_standalone() { +bool test_group_load_standalone() { // Tests dpct::group::load_algorithm::BLOCK_LOAD_DIRECT & dpct::group::load_algorithm::BLOCK_LOAD_STRIPED // as standalone methods sycl::queue q; @@ -174,48 +184,12 @@ bool test_load_blocked_striped_standalone() { sycl::host_accessor data_accessor(buffer, sycl::read_only); const int *ptr = data_accessor.get_multi_ptr(); - if(T == dpct::group::load_algorithm::BLOCK_LOAD_DIRECT) - { - for (int i = 0; i < 512; ++i) { - if (ptr[i] != i) { - std::cout <<" failed\n"; - std::ostream_iterator Iter(std::cout, ", "); - std::copy(ptr, ptr + 512, Iter); - std::cout << std::endl; - return false; - } - } - - std::cout <<" pass\n"; - return true; - } - else{ - int num_threads = 128; - int items_per_thread = 4; - for (int i = 0;i < num_threads; ++i){ - for(int j=0;j < items_per_thread; ++j){ - expected[i * items_per_thread +j] = j * num_threads +i; - } - } - for (int i = 0; i < 512; ++i) { - if (ptr[i] != expected[i]) { - std::cout <<" failed\n"; - std::ostream_iterator Iter(std::cout, ", "); - std::copy(ptr, ptr + 512, Iter); - std::cout << std::endl; - return false; - } - } - - std::cout <<" pass\n"; - return true; - - } + return helper_validation_function(ptr, "test_group_load"); } int main() { return !(test_group_load() && test_group_load() && test_load_subgroup_striped_standalone() && - test_load_blocked_striped_standalone() && test_load_blocked_striped_standalone()); + test_group_load_standalone() && test_group_load_standalone()); } From 24e80cc7c4822f34a234c21eef297cd70be2c286 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Mon, 6 May 2024 18:50:32 +0530 Subject: [PATCH 22/36] update test --- help_function/src/onedpl_test_group_load.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/help_function/src/onedpl_test_group_load.cpp b/help_function/src/onedpl_test_group_load.cpp index 82ba2992a..c921e349b 100644 --- a/help_function/src/onedpl_test_group_load.cpp +++ b/help_function/src/onedpl_test_group_load.cpp @@ -16,7 +16,7 @@ template bool helper_validation_function(const int* ptr, const char * func_name){ - if ( T == dpct::group::load_algorithm::BLOCK_LOAD_DIRECT) + if constexpr ( T == dpct::group::load_algorithm::BLOCK_LOAD_DIRECT) { for (int i = 0; i < 512; ++i) { if (ptr[i] != i) { @@ -112,7 +112,7 @@ bool test_group_load() { sycl::host_accessor data_accessor(buffer, sycl::read_only); const int *ptr = data_accessor.get_multi_ptr(); - return helper_validation_function(ptr, "test_group_load"); + return helper_validation_function(ptr, "test_group_load"); } bool test_load_subgroup_striped_standalone() { @@ -184,7 +184,7 @@ bool test_group_load_standalone() { sycl::host_accessor data_accessor(buffer, sycl::read_only); const int *ptr = data_accessor.get_multi_ptr(); - return helper_validation_function(ptr, "test_group_load"); + return helper_validation_function(ptr, "test_group_load"); } From 0f4c9ee4e592d3c76f69f28b8b45fff51c0f51e5 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Mon, 6 May 2024 18:52:16 +0530 Subject: [PATCH 23/36] fix return no code exec --- help_function/src/onedpl_test_group_load.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/help_function/src/onedpl_test_group_load.cpp b/help_function/src/onedpl_test_group_load.cpp index c921e349b..91c5e48a3 100644 --- a/help_function/src/onedpl_test_group_load.cpp +++ b/help_function/src/onedpl_test_group_load.cpp @@ -28,7 +28,7 @@ bool helper_validation_function(const int* ptr, const char * func_name){ } } std::cout << func_name << "_blocked" <<" pass\n"; - return true; + } else{ @@ -51,9 +51,9 @@ bool helper_validation_function(const int* ptr, const char * func_name){ } std::cout << func_name << "_striped" <<" pass\n"; - return true; + } - return false; + return true; } bool subgroup_helper_validation_function(const int* ptr,const int &sg_sz, const char* func_name){ From 890d4ff8d8602fa60b3180dbbc1a2655f30cb604 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Tue, 7 May 2024 16:48:33 +0530 Subject: [PATCH 24/36] fix compile issues and pass test --- help_function/src/onedpl_test_group_load.cpp | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/help_function/src/onedpl_test_group_load.cpp b/help_function/src/onedpl_test_group_load.cpp index 91c5e48a3..93befe164 100644 --- a/help_function/src/onedpl_test_group_load.cpp +++ b/help_function/src/onedpl_test_group_load.cpp @@ -56,13 +56,14 @@ bool helper_validation_function(const int* ptr, const char * func_name){ return true; } -bool subgroup_helper_validation_function(const int* ptr,const int &sg_sz, const char* func_name){ +bool subgroup_helper_validation_function(const int* ptr,const uint32_t *sg_sz, const char* func_name){ int expected[512]; int num_threads = 128; int items_per_thread = 4; + uint32_t sg_sz_val = *sg_sz; for (int i = 0; i < num_threads; ++i) { for (int j = 0; j < items_per_thread; ++j) { - expected[items_per_thread * i + j] = (i / sg_sz) * sg_sz * items_per_thread + sg_sz * j + i % sg_sz; + expected[items_per_thread * i + j] = (i / sg_sz_val) * sg_sz_val * items_per_thread + sg_sz_val * j + i % sg_sz_val; } } for (int i = 0; i < 512; ++i) { @@ -122,22 +123,23 @@ bool test_load_subgroup_striped_standalone() { for (int i = 0; i < 512; i++) data[i] = i; sycl::buffer buffer(data, 512); - sycl::buffer sg_sz_buf(1); + sycl::buffer sg_sz_buf{sycl::range<1>(1)}; q.submit([&](sycl::handler &h) { sycl::accessor dacc(buffer, h, sycl::read_write); + sycl::accessor sg_sz_dacc(sg_sz_buf, h, sycl::read_write); h.parallel_for( sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)), [=](sycl::nd_item<3> item) { int thread_data[4]; auto *d = dacc.get_multi_ptr().get(); - auto sg_sz_acc = sg_sz_buf.get_access.(h); + auto *sg_sz_acc = sg_sz_dacc.get_multi_ptr().get(); size_t gid = item.get_global_linear_id(); if (gid == 0) { sg_sz_acc[0] = item.get_sub_group().get_local_linear_range(); } dpct::group::uninitialized_load_subgroup_striped<4, int>(item, d, thread_data); // Write thread_data of each work item at index to the global buffer - int global_index = item.get_group(2)*item.get_local_range().get(2) + item.get_local_id(2); // Each thread_data has 4 elements + int global_index = (item.get_group(2)*item.get_local_range().get(2)) + item.get_local_id(2); // Each thread_data has 4 elements #pragma unroll for (int i = 0; i < 4; ++i) { dacc[global_index * 4 + i] = thread_data[i]; @@ -148,8 +150,9 @@ bool test_load_subgroup_striped_standalone() { sycl::host_accessor data_accessor(buffer, sycl::read_only); const int *ptr = data_accessor.get_multi_ptr(); - auto sg_sz = sg_sz_acc.get_host_access()[0]; - return subgroup_helper_validation_function(ptr, sg_sz, "test_subgroup_striped_standalone"); + sycl::host_accessor data_accessor_sg(sg_sz_buf, sycl::read_only); + const uint32_t *ptr_sg = data_accessor_sg.get_multi_ptr(); + return subgroup_helper_validation_function(ptr, ptr_sg, "test_subgroup_striped_standalone"); } template @@ -173,7 +176,7 @@ bool test_group_load_standalone() { else {dpct::group::load_striped<4, int>(item, d, thread_data);} // Write thread_data of each work item at index to the global buffer - int global_index = int global_index = item.get_group(2)*item.get_local_range().get(2) + item.get_local_id(2); // Each thread_data has 4 elements + int global_index = item.get_group(2)*item.get_local_range().get(2) + item.get_local_id(2); // Each thread_data has 4 elements #pragma unroll for (int i = 0; i < 4; ++i) { dacc[global_index * 4 + i] = thread_data[i]; From 6ec45aa7f533739d760a8b459aab3a9af10414d7 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Tue, 7 May 2024 16:53:20 +0530 Subject: [PATCH 25/36] add comments --- help_function/src/onedpl_test_group_load.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/help_function/src/onedpl_test_group_load.cpp b/help_function/src/onedpl_test_group_load.cpp index 93befe164..0cb314bc2 100644 --- a/help_function/src/onedpl_test_group_load.cpp +++ b/help_function/src/onedpl_test_group_load.cpp @@ -193,6 +193,11 @@ bool test_group_load_standalone() { int main() { - return !(test_group_load() && test_group_load() && test_load_subgroup_striped_standalone() && - test_group_load_standalone() && test_group_load_standalone()); + return !( + // Calls test_group_load with blocked and striped strategies , should pass both results. + test_group_load() && test_group_load() && + // Calls test_load_subgroup_striped_standalone and should pass + test_load_subgroup_striped_standalone() && + // Calls test_group_load_standalone with blocked and striped strategies as free functions, should pass both results. + test_group_load_standalone() && test_group_load_standalone()); } From 3a5c119b0c8661f7fce55c53735eabef4cf88317 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Wed, 8 May 2024 22:29:46 +0530 Subject: [PATCH 26/36] apply barrier for sync and update test case --- help_function/src/onedpl_test_group_load.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/help_function/src/onedpl_test_group_load.cpp b/help_function/src/onedpl_test_group_load.cpp index 0cb314bc2..f7424429c 100644 --- a/help_function/src/onedpl_test_group_load.cpp +++ b/help_function/src/onedpl_test_group_load.cpp @@ -76,7 +76,7 @@ bool subgroup_helper_validation_function(const int* ptr,const uint32_t *sg_sz, c } } - std::cout <<" pass\n"; + std::cout << func_name <<" pass\n"; return true; } @@ -101,6 +101,7 @@ bool test_group_load() { auto *d = data_accessor.get_multi_ptr().get(); auto *tmp = tacc.get_multi_ptr().get(); group_load(tmp).load(item, d, thread_data); + item.barrier(sycl::access::fence_space::local_space); // Write thread_data of each work item at index to the global buffer int global_index = item.get_group(2)*item.get_local_range().get(2) + item.get_local_id(2); // Each thread_data has 4 elements #pragma unroll @@ -175,6 +176,7 @@ bool test_group_load_standalone() { {dpct::group::load_blocked<4, int>(item, d, thread_data);} else {dpct::group::load_striped<4, int>(item, d, thread_data);} + item.barrier(sycl::access::fence_space::local_space); // Write thread_data of each work item at index to the global buffer int global_index = item.get_group(2)*item.get_local_range().get(2) + item.get_local_id(2); // Each thread_data has 4 elements #pragma unroll From 8b4249421153eb3e722bf08ac0d9251fb185f747 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Thu, 9 May 2024 11:00:50 +0530 Subject: [PATCH 27/36] review comments --- help_function/src/onedpl_test_group_load.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/help_function/src/onedpl_test_group_load.cpp b/help_function/src/onedpl_test_group_load.cpp index f7424429c..8ec79d42c 100644 --- a/help_function/src/onedpl_test_group_load.cpp +++ b/help_function/src/onedpl_test_group_load.cpp @@ -1,4 +1,4 @@ -// ====------ onedpl_test_group_load.cpp------------ *- C++ -* ----===// +// ====------ util_group_load_test.cpp------------ *- C++ -* ----===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. @@ -84,7 +84,7 @@ template bool test_group_load() { // Tests dpct::group::load_algorithm::BLOCK_LOAD_DIRECT & dpct::group::load_algorithm::BLOCK_LOAD_STRIPED // in its entirety as API functions - sycl::queue q; + sycl::queue q(dpct::get_default_queue()); oneapi::dpl::counting_iterator count_it(0); sycl::buffer buffer(count_it, count_it + 512); @@ -119,7 +119,7 @@ bool test_group_load() { bool test_load_subgroup_striped_standalone() { // Tests dpct::group::load_subgroup_striped as standalone method - sycl::queue q; + sycl::queue q(dpct::get_default_queue()); int data[512]; for (int i = 0; i < 512; i++) data[i] = i; @@ -160,7 +160,7 @@ template bool test_group_load_standalone() { // Tests dpct::group::load_algorithm::BLOCK_LOAD_DIRECT & dpct::group::load_algorithm::BLOCK_LOAD_STRIPED // as standalone methods - sycl::queue q; + sycl::queue q(dpct::get_default_queue()); int data[512]; for (int i = 0; i < 512; i++) data[i] = i; From e79a7fc1f068c67d23ddd25bd1ce3656fc4a9389 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Thu, 9 May 2024 11:01:49 +0530 Subject: [PATCH 28/36] update --- help_function/help_function.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/help_function/help_function.xml b/help_function/help_function.xml index 712899081..ea78960cd 100644 --- a/help_function/help_function.xml +++ b/help_function/help_function.xml @@ -144,7 +144,7 @@ - + From 68618bc5449476b96bb28add6a67242f217f0691 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Thu, 9 May 2024 00:49:42 -0700 Subject: [PATCH 29/36] rename file --- .../src/{onedpl_test_group_load.cpp => util_group_load_test.cpp} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename help_function/src/{onedpl_test_group_load.cpp => util_group_load_test.cpp} (100%) diff --git a/help_function/src/onedpl_test_group_load.cpp b/help_function/src/util_group_load_test.cpp similarity index 100% rename from help_function/src/onedpl_test_group_load.cpp rename to help_function/src/util_group_load_test.cpp From 1905ed984e231f2b4c621298f441859d05f5cd72 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Thu, 9 May 2024 18:52:22 +0530 Subject: [PATCH 30/36] remove barrier and use output buffer --- help_function/src/util_group_load_test.cpp | 54 ++++++++++++++-------- 1 file changed, 34 insertions(+), 20 deletions(-) diff --git a/help_function/src/util_group_load_test.cpp b/help_function/src/util_group_load_test.cpp index 8ec79d42c..194df0334 100644 --- a/help_function/src/util_group_load_test.cpp +++ b/help_function/src/util_group_load_test.cpp @@ -87,32 +87,36 @@ bool test_group_load() { sycl::queue q(dpct::get_default_queue()); oneapi::dpl::counting_iterator count_it(0); sycl::buffer buffer(count_it, count_it + 512); + int data_out[512]; + for (int i = 0; i < 512; i++) data_out[i] = 0; + sycl::buffer buffer_out(data_out, 512); q.submit([&](sycl::handler &h) { using group_load = dpct::group::workgroup_load<4, T, int, int *, sycl::nd_item<3>>; size_t temp_storage_size = group_load::get_local_memory_size(128); sycl::local_accessor tacc(sycl::range<1>(temp_storage_size), h); - sycl::accessor data_accessor(buffer, h, sycl::read_write); - + sycl::accessor data_accessor_read(buffer, h, sycl::read_only); + sycl::accessor data_accessor_write(buffer_out, h, sycl::write_only); h.parallel_for( sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)), [=](sycl::nd_item<3> item) { int thread_data[4]; - auto *d = data_accessor.get_multi_ptr().get(); + auto *d_r = data_accessor_read.get_multi_ptr().get(); + auto *d_w = data_accessor_write.get_multi_ptr().get(); auto *tmp = tacc.get_multi_ptr().get(); - group_load(tmp).load(item, d, thread_data); - item.barrier(sycl::access::fence_space::local_space); + group_load(tmp).load(item, d_r, thread_data); + //item.barrier(sycl::access::fence_space::local_space); // Write thread_data of each work item at index to the global buffer int global_index = item.get_group(2)*item.get_local_range().get(2) + item.get_local_id(2); // Each thread_data has 4 elements #pragma unroll for (int i = 0; i < 4; ++i) { - data_accessor[global_index * 4 + i] = thread_data[i]; + data_accessor_write[global_index * 4 + i] = thread_data[i]; } }); }); q.wait_and_throw(); - sycl::host_accessor data_accessor(buffer, sycl::read_only); + sycl::host_accessor data_accessor(buffer_out, sycl::read_only); const int *ptr = data_accessor.get_multi_ptr(); return helper_validation_function(ptr, "test_group_load"); } @@ -122,34 +126,39 @@ bool test_load_subgroup_striped_standalone() { sycl::queue q(dpct::get_default_queue()); int data[512]; for (int i = 0; i < 512; i++) data[i] = i; - sycl::buffer buffer(data, 512); sycl::buffer sg_sz_buf{sycl::range<1>(1)}; + int data_out[512]; + for (int i = 0; i < 512; i++) data_out[i] = 0; + sycl::buffer buffer_out(data_out, 512); + q.submit([&](sycl::handler &h) { - sycl::accessor dacc(buffer, h, sycl::read_write); + sycl::accessor dacc_read(buffer, h, sycl::read_only); + sycl::accessor dacc_write(buffer_out, h, sycl::write_only); sycl::accessor sg_sz_dacc(sg_sz_buf, h, sycl::read_write); h.parallel_for( sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)), [=](sycl::nd_item<3> item) { int thread_data[4]; - auto *d = dacc.get_multi_ptr().get(); + auto *d_r = dacc_read.get_multi_ptr().get(); + auto *d_w = dacc_write.get_multi_ptr().get(); auto *sg_sz_acc = sg_sz_dacc.get_multi_ptr().get(); size_t gid = item.get_global_linear_id(); if (gid == 0) { sg_sz_acc[0] = item.get_sub_group().get_local_linear_range(); } - dpct::group::uninitialized_load_subgroup_striped<4, int>(item, d, thread_data); + dpct::group::uninitialized_load_subgroup_striped<4, int>(item, d_r, thread_data); // Write thread_data of each work item at index to the global buffer int global_index = (item.get_group(2)*item.get_local_range().get(2)) + item.get_local_id(2); // Each thread_data has 4 elements #pragma unroll for (int i = 0; i < 4; ++i) { - dacc[global_index * 4 + i] = thread_data[i]; + dacc_write[global_index * 4 + i] = thread_data[i]; } }); }); q.wait_and_throw(); - sycl::host_accessor data_accessor(buffer, sycl::read_only); + sycl::host_accessor data_accessor(buffer_out, sycl::read_only); const int *ptr = data_accessor.get_multi_ptr(); sycl::host_accessor data_accessor_sg(sg_sz_buf, sycl::read_only); const uint32_t *ptr_sg = data_accessor_sg.get_multi_ptr(); @@ -163,31 +172,36 @@ bool test_group_load_standalone() { sycl::queue q(dpct::get_default_queue()); int data[512]; for (int i = 0; i < 512; i++) data[i] = i; - sycl::buffer buffer(data, 512); + int data_out[512]; + for (int i = 0; i < 512; i++) data_out[i] = 0; + sycl::buffer buffer_out(data_out, 512); + q.submit([&](sycl::handler &h) { - sycl::accessor dacc(buffer, h, sycl::read_write); + sycl::accessor dacc_read(buffer, h, sycl::read_only); + sycl::accessor dacc_write(buffer_out, h, sycl::write_only); h.parallel_for( sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)), [=](sycl::nd_item<3> item) { int thread_data[4]; - auto *d = dacc.get_multi_ptr().get(); + auto *d_r = dacc_read.get_multi_ptr().get(); + auto *d_w = dacc_write.get_multi_ptr().get(); if( T == dpct::group::load_algorithm::BLOCK_LOAD_DIRECT) {dpct::group::load_blocked<4, int>(item, d, thread_data);} else - {dpct::group::load_striped<4, int>(item, d, thread_data);} - item.barrier(sycl::access::fence_space::local_space); + {dpct::group::load_striped<4, int>(item, d_r, thread_data);} + //item.barrier(sycl::access::fence_space::local_space); // Write thread_data of each work item at index to the global buffer int global_index = item.get_group(2)*item.get_local_range().get(2) + item.get_local_id(2); // Each thread_data has 4 elements #pragma unroll for (int i = 0; i < 4; ++i) { - dacc[global_index * 4 + i] = thread_data[i]; + dacc_w[global_index * 4 + i] = thread_data[i]; } }); }); q.wait_and_throw(); - sycl::host_accessor data_accessor(buffer, sycl::read_only); + sycl::host_accessor data_accessor(buffer_out, sycl::read_only); const int *ptr = data_accessor.get_multi_ptr(); return helper_validation_function(ptr, "test_group_load"); } From cf9fdd9c964c7ba49b104faf770639e421e7f6ea Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Thu, 9 May 2024 22:27:06 +0530 Subject: [PATCH 31/36] rm unused variables --- help_function/src/util_group_load_test.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/help_function/src/util_group_load_test.cpp b/help_function/src/util_group_load_test.cpp index 194df0334..ce501531f 100644 --- a/help_function/src/util_group_load_test.cpp +++ b/help_function/src/util_group_load_test.cpp @@ -102,10 +102,8 @@ bool test_group_load() { [=](sycl::nd_item<3> item) { int thread_data[4]; auto *d_r = data_accessor_read.get_multi_ptr().get(); - auto *d_w = data_accessor_write.get_multi_ptr().get(); auto *tmp = tacc.get_multi_ptr().get(); group_load(tmp).load(item, d_r, thread_data); - //item.barrier(sycl::access::fence_space::local_space); // Write thread_data of each work item at index to the global buffer int global_index = item.get_group(2)*item.get_local_range().get(2) + item.get_local_id(2); // Each thread_data has 4 elements #pragma unroll @@ -141,7 +139,6 @@ bool test_load_subgroup_striped_standalone() { [=](sycl::nd_item<3> item) { int thread_data[4]; auto *d_r = dacc_read.get_multi_ptr().get(); - auto *d_w = dacc_write.get_multi_ptr().get(); auto *sg_sz_acc = sg_sz_dacc.get_multi_ptr().get(); size_t gid = item.get_global_linear_id(); if (gid == 0) { @@ -185,12 +182,10 @@ bool test_group_load_standalone() { [=](sycl::nd_item<3> item) { int thread_data[4]; auto *d_r = dacc_read.get_multi_ptr().get(); - auto *d_w = dacc_write.get_multi_ptr().get(); if( T == dpct::group::load_algorithm::BLOCK_LOAD_DIRECT) {dpct::group::load_blocked<4, int>(item, d, thread_data);} else {dpct::group::load_striped<4, int>(item, d_r, thread_data);} - //item.barrier(sycl::access::fence_space::local_space); // Write thread_data of each work item at index to the global buffer int global_index = item.get_group(2)*item.get_local_range().get(2) + item.get_local_id(2); // Each thread_data has 4 elements #pragma unroll From 31382c1a680e9b7920034b7f11a9361c2110f926 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Thu, 9 May 2024 22:31:54 +0530 Subject: [PATCH 32/36] fix test case --- help_function/src/util_group_load_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/help_function/src/util_group_load_test.cpp b/help_function/src/util_group_load_test.cpp index ce501531f..9b171e8f1 100644 --- a/help_function/src/util_group_load_test.cpp +++ b/help_function/src/util_group_load_test.cpp @@ -190,7 +190,7 @@ bool test_group_load_standalone() { int global_index = item.get_group(2)*item.get_local_range().get(2) + item.get_local_id(2); // Each thread_data has 4 elements #pragma unroll for (int i = 0; i < 4; ++i) { - dacc_w[global_index * 4 + i] = thread_data[i]; + dacc_write[global_index * 4 + i] = thread_data[i]; } }); }); From 5a5881b386a4c5ad1eba1cf539ad0b215196be24 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Thu, 9 May 2024 23:02:38 +0530 Subject: [PATCH 33/36] add local changes and run tests --- help_function/src/util_group_load_test.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/help_function/src/util_group_load_test.cpp b/help_function/src/util_group_load_test.cpp index 9b171e8f1..c8cabdfd9 100644 --- a/help_function/src/util_group_load_test.cpp +++ b/help_function/src/util_group_load_test.cpp @@ -92,7 +92,7 @@ bool test_group_load() { sycl::buffer buffer_out(data_out, 512); q.submit([&](sycl::handler &h) { - using group_load = dpct::group::workgroup_load<4, T, int, int *, sycl::nd_item<3>>; + using group_load = dpct::group::workgroup_load<4, T, int, const int *, sycl::nd_item<3>>; size_t temp_storage_size = group_load::get_local_memory_size(128); sycl::local_accessor tacc(sycl::range<1>(temp_storage_size), h); sycl::accessor data_accessor_read(buffer, h, sycl::read_only); @@ -183,7 +183,7 @@ bool test_group_load_standalone() { int thread_data[4]; auto *d_r = dacc_read.get_multi_ptr().get(); if( T == dpct::group::load_algorithm::BLOCK_LOAD_DIRECT) - {dpct::group::load_blocked<4, int>(item, d, thread_data);} + {dpct::group::load_blocked<4, int>(item, d_r, thread_data);} else {dpct::group::load_striped<4, int>(item, d_r, thread_data);} // Write thread_data of each work item at index to the global buffer @@ -201,7 +201,6 @@ bool test_group_load_standalone() { return helper_validation_function(ptr, "test_group_load"); } - int main() { return !( From 7f6ed89c254048facd28734780cc7e37b8e1910f Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Fri, 10 May 2024 08:56:21 +0530 Subject: [PATCH 34/36] format changes --- help_function/src/util_group_load_test.cpp | 46 ++++++++++------------ 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/help_function/src/util_group_load_test.cpp b/help_function/src/util_group_load_test.cpp index c8cabdfd9..6f47b268e 100644 --- a/help_function/src/util_group_load_test.cpp +++ b/help_function/src/util_group_load_test.cpp @@ -13,11 +13,9 @@ #include #include - template bool helper_validation_function(const int* ptr, const char * func_name){ - if constexpr ( T == dpct::group::load_algorithm::BLOCK_LOAD_DIRECT) - { + if constexpr ( T == dpct::group::load_algorithm::BLOCK_LOAD_DIRECT){ for (int i = 0; i < 512; ++i) { if (ptr[i] != i) { std::cout << func_name << "_blocked" <<" failed\n"; @@ -27,31 +25,27 @@ bool helper_validation_function(const int* ptr, const char * func_name){ return false; } } - std::cout << func_name << "_blocked" <<" pass\n"; - + std::cout << func_name << "_blocked" <<" pass\n"; } - else{ - int expected[512]; - int num_threads = 128; - int items_per_thread = 4; - for (int i = 0;i < num_threads; ++i){ - for(int j=0;j < items_per_thread; ++j){ - expected[i * items_per_thread +j] = j * num_threads +i; - } - } - for (int i = 0; i < 512; ++i) { - if (ptr[i] != expected[i]) { - std::cout << func_name << "_striped" <<" failed\n"; - std::ostream_iterator Iter(std::cout, ", "); - std::copy(ptr, ptr + 512, Iter); - std::cout << std::endl; - return false; - } - } - - std::cout << func_name << "_striped" <<" pass\n"; - + int expected[512]; + int num_threads = 128; + int items_per_thread = 4; + for (int i = 0;i < num_threads; ++i){ + for(int j=0;j < items_per_thread; ++j){ + expected[i * items_per_thread +j] = j * num_threads +i; + } + } + for (int i = 0; i < 512; ++i) { + if (ptr[i] != expected[i]) { + std::cout << func_name << "_striped" <<" failed\n"; + std::ostream_iterator Iter(std::cout, ", "); + std::copy(ptr, ptr + 512, Iter); + std::cout << std::endl; + return false; + } + } + std::cout << func_name << "_striped" <<" pass\n"; } return true; } From 92650b8589dbd96a475f5861d4211d4b273ffbe1 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Thu, 9 May 2024 23:20:20 -0700 Subject: [PATCH 35/36] revert rename --- help_function/help_function.xml | 2 +- .../{util_group_load_test.cpp => onedpl_test_group_load.cpp} | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename help_function/src/{util_group_load_test.cpp => onedpl_test_group_load.cpp} (96%) diff --git a/help_function/help_function.xml b/help_function/help_function.xml index ea78960cd..712899081 100644 --- a/help_function/help_function.xml +++ b/help_function/help_function.xml @@ -144,7 +144,7 @@ - + diff --git a/help_function/src/util_group_load_test.cpp b/help_function/src/onedpl_test_group_load.cpp similarity index 96% rename from help_function/src/util_group_load_test.cpp rename to help_function/src/onedpl_test_group_load.cpp index 6f47b268e..3626cafec 100644 --- a/help_function/src/util_group_load_test.cpp +++ b/help_function/src/onedpl_test_group_load.cpp @@ -1,4 +1,4 @@ -// ====------ util_group_load_test.cpp------------ *- C++ -* ----===// +// ====------ onedpl_test_group_load.cpp------------ *- C++ -* ----===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. From 1a9d205be8df9ea336a32c2dba90a01f3ba7f7a2 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Thu, 9 May 2024 23:26:37 -0700 Subject: [PATCH 36/36] clang format --- help_function/src/onedpl_test_group_load.cpp | 206 +++++++++++-------- 1 file changed, 119 insertions(+), 87 deletions(-) diff --git a/help_function/src/onedpl_test_group_load.cpp b/help_function/src/onedpl_test_group_load.cpp index 3626cafec..b68e60efa 100644 --- a/help_function/src/onedpl_test_group_load.cpp +++ b/help_function/src/onedpl_test_group_load.cpp @@ -7,62 +7,68 @@ // // // ===----------------------------------------------------------------------===// -#include #include #include #include #include +#include -template -bool helper_validation_function(const int* ptr, const char * func_name){ - if constexpr ( T == dpct::group::load_algorithm::BLOCK_LOAD_DIRECT){ - for (int i = 0; i < 512; ++i) { +template +bool helper_validation_function(const int *ptr, const char *func_name) { + if constexpr (T == dpct::group::load_algorithm::BLOCK_LOAD_DIRECT) { + for (int i = 0; i < 512; ++i) { if (ptr[i] != i) { - std::cout << func_name << "_blocked" <<" failed\n"; + std::cout << func_name << "_blocked" + << " failed\n"; std::ostream_iterator Iter(std::cout, ", "); std::copy(ptr, ptr + 512, Iter); std::cout << std::endl; return false; - } - } - std::cout << func_name << "_blocked" <<" pass\n"; - } - else{ - int expected[512]; - int num_threads = 128; - int items_per_thread = 4; - for (int i = 0;i < num_threads; ++i){ - for(int j=0;j < items_per_thread; ++j){ - expected[i * items_per_thread +j] = j * num_threads +i; - } - } - for (int i = 0; i < 512; ++i) { - if (ptr[i] != expected[i]) { - std::cout << func_name << "_striped" <<" failed\n"; - std::ostream_iterator Iter(std::cout, ", "); - std::copy(ptr, ptr + 512, Iter); - std::cout << std::endl; - return false; - } - } - std::cout << func_name << "_striped" <<" pass\n"; + } + } + std::cout << func_name << "_blocked" + << " pass\n"; + } else { + int expected[512]; + int num_threads = 128; + int items_per_thread = 4; + for (int i = 0; i < num_threads; ++i) { + for (int j = 0; j < items_per_thread; ++j) { + expected[i * items_per_thread + j] = j * num_threads + i; + } + } + for (int i = 0; i < 512; ++i) { + if (ptr[i] != expected[i]) { + std::cout << func_name << "_striped" + << " failed\n"; + std::ostream_iterator Iter(std::cout, ", "); + std::copy(ptr, ptr + 512, Iter); + std::cout << std::endl; + return false; + } + } + std::cout << func_name << "_striped" + << " pass\n"; } return true; } -bool subgroup_helper_validation_function(const int* ptr,const uint32_t *sg_sz, const char* func_name){ +bool subgroup_helper_validation_function(const int *ptr, const uint32_t *sg_sz, + const char *func_name) { int expected[512]; int num_threads = 128; int items_per_thread = 4; uint32_t sg_sz_val = *sg_sz; for (int i = 0; i < num_threads; ++i) { - for (int j = 0; j < items_per_thread; ++j) { - expected[items_per_thread * i + j] = (i / sg_sz_val) * sg_sz_val * items_per_thread + sg_sz_val * j + i % sg_sz_val; - } - } + for (int j = 0; j < items_per_thread; ++j) { + expected[items_per_thread * i + j] = + (i / sg_sz_val) * sg_sz_val * items_per_thread + sg_sz_val * j + + i % sg_sz_val; + } + } for (int i = 0; i < 512; ++i) { if (ptr[i] != expected[i]) { - std::cout <<" failed\n"; + std::cout << " failed\n"; std::ostream_iterator Iter(std::cout, ", "); std::copy(ptr, ptr + 512, Iter); std::cout << std::endl; @@ -70,23 +76,25 @@ bool subgroup_helper_validation_function(const int* ptr,const uint32_t *sg_sz, c } } - std::cout << func_name <<" pass\n"; + std::cout << func_name << " pass\n"; return true; } -template -bool test_group_load() { - // Tests dpct::group::load_algorithm::BLOCK_LOAD_DIRECT & dpct::group::load_algorithm::BLOCK_LOAD_STRIPED - // in its entirety as API functions +template bool test_group_load() { + // Tests dpct::group::load_algorithm::BLOCK_LOAD_DIRECT & + // dpct::group::load_algorithm::BLOCK_LOAD_STRIPED in its entirety as API + // functions sycl::queue q(dpct::get_default_queue()); oneapi::dpl::counting_iterator count_it(0); sycl::buffer buffer(count_it, count_it + 512); int data_out[512]; - for (int i = 0; i < 512; i++) data_out[i] = 0; + for (int i = 0; i < 512; i++) + data_out[i] = 0; sycl::buffer buffer_out(data_out, 512); - + q.submit([&](sycl::handler &h) { - using group_load = dpct::group::workgroup_load<4, T, int, const int *, sycl::nd_item<3>>; + using group_load = + dpct::group::workgroup_load<4, T, int, const int *, sycl::nd_item<3>>; size_t temp_storage_size = group_load::get_local_memory_size(128); sycl::local_accessor tacc(sycl::range<1>(temp_storage_size), h); sycl::accessor data_accessor_read(buffer, h, sycl::read_only); @@ -95,19 +103,23 @@ bool test_group_load() { sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)), [=](sycl::nd_item<3> item) { int thread_data[4]; - auto *d_r = data_accessor_read.get_multi_ptr().get(); + auto *d_r = + data_accessor_read.get_multi_ptr() + .get(); auto *tmp = tacc.get_multi_ptr().get(); group_load(tmp).load(item, d_r, thread_data); // Write thread_data of each work item at index to the global buffer - int global_index = item.get_group(2)*item.get_local_range().get(2) + item.get_local_id(2); // Each thread_data has 4 elements - #pragma unroll - for (int i = 0; i < 4; ++i) { - data_accessor_write[global_index * 4 + i] = thread_data[i]; - } + int global_index = + item.get_group(2) * item.get_local_range().get(2) + + item.get_local_id(2); // Each thread_data has 4 elements +#pragma unroll + for (int i = 0; i < 4; ++i) { + data_accessor_write[global_index * 4 + i] = thread_data[i]; + } }); }); q.wait_and_throw(); - + sycl::host_accessor data_accessor(buffer_out, sycl::read_only); const int *ptr = data_accessor.get_multi_ptr(); return helper_validation_function(ptr, "test_group_load"); @@ -117,13 +129,15 @@ bool test_load_subgroup_striped_standalone() { // Tests dpct::group::load_subgroup_striped as standalone method sycl::queue q(dpct::get_default_queue()); int data[512]; - for (int i = 0; i < 512; i++) data[i] = i; + for (int i = 0; i < 512; i++) + data[i] = i; sycl::buffer buffer(data, 512); sycl::buffer sg_sz_buf{sycl::range<1>(1)}; int data_out[512]; - for (int i = 0; i < 512; i++) data_out[i] = 0; + for (int i = 0; i < 512; i++) + data_out[i] = 0; sycl::buffer buffer_out(data_out, 512); - + q.submit([&](sycl::handler &h) { sycl::accessor dacc_read(buffer, h, sycl::read_only); sycl::accessor dacc_write(buffer_out, h, sycl::write_only); @@ -132,19 +146,24 @@ bool test_load_subgroup_striped_standalone() { sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)), [=](sycl::nd_item<3> item) { int thread_data[4]; - auto *d_r = dacc_read.get_multi_ptr().get(); - auto *sg_sz_acc = sg_sz_dacc.get_multi_ptr().get(); + auto *d_r = + dacc_read.get_multi_ptr().get(); + auto *sg_sz_acc = + sg_sz_dacc.get_multi_ptr().get(); size_t gid = item.get_global_linear_id(); if (gid == 0) { - sg_sz_acc[0] = item.get_sub_group().get_local_linear_range(); + sg_sz_acc[0] = item.get_sub_group().get_local_linear_range(); } - dpct::group::uninitialized_load_subgroup_striped<4, int>(item, d_r, thread_data); + dpct::group::uninitialized_load_subgroup_striped<4, int>(item, d_r, + thread_data); // Write thread_data of each work item at index to the global buffer - int global_index = (item.get_group(2)*item.get_local_range().get(2)) + item.get_local_id(2); // Each thread_data has 4 elements - #pragma unroll + int global_index = + (item.get_group(2) * item.get_local_range().get(2)) + + item.get_local_id(2); // Each thread_data has 4 elements +#pragma unroll for (int i = 0; i < 4; ++i) { - dacc_write[global_index * 4 + i] = thread_data[i]; - } + dacc_write[global_index * 4 + i] = thread_data[i]; + } }); }); q.wait_and_throw(); @@ -152,22 +171,25 @@ bool test_load_subgroup_striped_standalone() { sycl::host_accessor data_accessor(buffer_out, sycl::read_only); const int *ptr = data_accessor.get_multi_ptr(); sycl::host_accessor data_accessor_sg(sg_sz_buf, sycl::read_only); - const uint32_t *ptr_sg = data_accessor_sg.get_multi_ptr(); - return subgroup_helper_validation_function(ptr, ptr_sg, "test_subgroup_striped_standalone"); + const uint32_t *ptr_sg = + data_accessor_sg.get_multi_ptr(); + return subgroup_helper_validation_function( + ptr, ptr_sg, "test_subgroup_striped_standalone"); } -template -bool test_group_load_standalone() { - // Tests dpct::group::load_algorithm::BLOCK_LOAD_DIRECT & dpct::group::load_algorithm::BLOCK_LOAD_STRIPED - // as standalone methods +template bool test_group_load_standalone() { + // Tests dpct::group::load_algorithm::BLOCK_LOAD_DIRECT & + // dpct::group::load_algorithm::BLOCK_LOAD_STRIPED as standalone methods sycl::queue q(dpct::get_default_queue()); int data[512]; - for (int i = 0; i < 512; i++) data[i] = i; + for (int i = 0; i < 512; i++) + data[i] = i; sycl::buffer buffer(data, 512); int data_out[512]; - for (int i = 0; i < 512; i++) data_out[i] = 0; + for (int i = 0; i < 512; i++) + data_out[i] = 0; sycl::buffer buffer_out(data_out, 512); - + q.submit([&](sycl::handler &h) { sycl::accessor dacc_read(buffer, h, sycl::read_only); sycl::accessor dacc_write(buffer_out, h, sycl::write_only); @@ -175,17 +197,21 @@ bool test_group_load_standalone() { sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)), [=](sycl::nd_item<3> item) { int thread_data[4]; - auto *d_r = dacc_read.get_multi_ptr().get(); - if( T == dpct::group::load_algorithm::BLOCK_LOAD_DIRECT) - {dpct::group::load_blocked<4, int>(item, d_r, thread_data);} - else - {dpct::group::load_striped<4, int>(item, d_r, thread_data);} + auto *d_r = + dacc_read.get_multi_ptr().get(); + if (T == dpct::group::load_algorithm::BLOCK_LOAD_DIRECT) { + dpct::group::load_blocked<4, int>(item, d_r, thread_data); + } else { + dpct::group::load_striped<4, int>(item, d_r, thread_data); + } // Write thread_data of each work item at index to the global buffer - int global_index = item.get_group(2)*item.get_local_range().get(2) + item.get_local_id(2); // Each thread_data has 4 elements - #pragma unroll - for (int i = 0; i < 4; ++i) { - dacc_write[global_index * 4 + i] = thread_data[i]; - } + int global_index = + item.get_group(2) * item.get_local_range().get(2) + + item.get_local_id(2); // Each thread_data has 4 elements +#pragma unroll + for (int i = 0; i < 4; ++i) { + dacc_write[global_index * 4 + i] = thread_data[i]; + } }); }); q.wait_and_throw(); @@ -196,12 +222,18 @@ bool test_group_load_standalone() { } int main() { - + return !( - // Calls test_group_load with blocked and striped strategies , should pass both results. - test_group_load() && test_group_load() && - // Calls test_load_subgroup_striped_standalone and should pass - test_load_subgroup_striped_standalone() && - // Calls test_group_load_standalone with blocked and striped strategies as free functions, should pass both results. - test_group_load_standalone() && test_group_load_standalone()); + // Calls test_group_load with blocked and striped strategies , should pass + // both results. + test_group_load() && + test_group_load() && + // Calls test_load_subgroup_striped_standalone and should pass + test_load_subgroup_striped_standalone() && + // Calls test_group_load_standalone with blocked and striped strategies as + // free functions, should pass both results. + test_group_load_standalone< + dpct::group::load_algorithm::BLOCK_LOAD_STRIPED>() && + test_group_load_standalone< + dpct::group::load_algorithm::BLOCK_LOAD_DIRECT>()); }