-
Notifications
You must be signed in to change notification settings - Fork 32
[SYClomatic-test] Add help function for Block Load header #619
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 35 commits
c08ae48
513b103
d8b7cb7
be5eefe
66ec770
146c0d1
ec5d9a8
400f48d
5878cb7
022e628
ae0e1d6
d2ae0ee
23b7023
7c2ff7e
fe51693
c5447c0
d99a828
5ddb779
ab0277a
300ac08
9c21a36
7cd159c
2b54c4c
24e80cc
0f4c9ee
890d4ff
6ec45aa
3a5c119
8b42494
e79a7fc
68618bc
1905ed9
cf9fdd9
31382c1
5a5881b
7f6ed89
92650b8
1a9d205
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,213 @@ | ||
// ====------ util_group_load_test.cpp------------ *- C++ -* ----===// | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we keep the filename same as other onedpl test? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the other similar filenames for sort etc should also change when being migrated to the group_util.hpp file. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This shouldn't be done in this PR. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is not testing oneDPL or oneDPL-like functionality. It is currently placed in the dpcpp extensions header with the intention to move it to a new header which is targeted at sycl compat. This code tests a helper function called from within a kernel, which is a lower level than the rest of oneDPL or oneDPL compatibility headers. If we must have the name as onedpl test until the code is moved, that is fine, but it is just more that needs to be changed later. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes noted, will update the file paths when writing linking tests for oneapi-src/SYCLomatic#1784 . |
||
|
||
// | ||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// | ||
// | ||
// ===----------------------------------------------------------------------===// | ||
#include <sycl/sycl.hpp> | ||
#include <dpct/dpct.hpp> | ||
#include <dpct/dpl_utils.hpp> | ||
#include <iostream> | ||
#include <oneapi/dpl/iterator> | ||
|
||
|
||
template<dpct::group::load_algorithm T> | ||
bool helper_validation_function(const int* ptr, const char * func_name){ | ||
yihanwg marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if constexpr ( T == dpct::group::load_algorithm::BLOCK_LOAD_DIRECT) | ||
{ | ||
yihanwg marked this conversation as resolved.
Show resolved
Hide resolved
|
||
for (int i = 0; i < 512; ++i) { | ||
if (ptr[i] != i) { | ||
std::cout << func_name << "_blocked" <<" failed\n"; | ||
std::ostream_iterator<int> Iter(std::cout, ", "); | ||
std::copy(ptr, ptr + 512, Iter); | ||
std::cout << std::endl; | ||
return false; | ||
} | ||
} | ||
std::cout << func_name << "_blocked" <<" pass\n"; | ||
|
||
} | ||
|
||
else{ | ||
yihanwg marked this conversation as resolved.
Show resolved
Hide resolved
|
||
int expected[512]; | ||
int num_threads = 128; | ||
int items_per_thread = 4; | ||
for (int i = 0;i < num_threads; ++i){ | ||
for(int j=0;j < items_per_thread; ++j){ | ||
expected[i * items_per_thread +j] = j * num_threads +i; | ||
} | ||
} | ||
for (int i = 0; i < 512; ++i) { | ||
if (ptr[i] != expected[i]) { | ||
std::cout << func_name << "_striped" <<" failed\n"; | ||
std::ostream_iterator<int> Iter(std::cout, ", "); | ||
std::copy(ptr, ptr + 512, Iter); | ||
std::cout << std::endl; | ||
return false; | ||
} | ||
} | ||
|
||
std::cout << func_name << "_striped" <<" pass\n"; | ||
|
||
} | ||
return true; | ||
} | ||
|
||
bool subgroup_helper_validation_function(const int* ptr,const uint32_t *sg_sz, const char* func_name){ | ||
int expected[512]; | ||
int num_threads = 128; | ||
int items_per_thread = 4; | ||
uint32_t sg_sz_val = *sg_sz; | ||
for (int i = 0; i < num_threads; ++i) { | ||
for (int j = 0; j < items_per_thread; ++j) { | ||
expected[items_per_thread * i + j] = (i / sg_sz_val) * sg_sz_val * items_per_thread + sg_sz_val * j + i % sg_sz_val; | ||
} | ||
} | ||
for (int i = 0; i < 512; ++i) { | ||
if (ptr[i] != expected[i]) { | ||
std::cout <<" failed\n"; | ||
std::ostream_iterator<int> Iter(std::cout, ", "); | ||
std::copy(ptr, ptr + 512, Iter); | ||
std::cout << std::endl; | ||
return false; | ||
} | ||
} | ||
|
||
std::cout << func_name <<" pass\n"; | ||
return true; | ||
} | ||
|
||
template<dpct::group::load_algorithm T> | ||
bool test_group_load() { | ||
// Tests dpct::group::load_algorithm::BLOCK_LOAD_DIRECT & dpct::group::load_algorithm::BLOCK_LOAD_STRIPED | ||
// in its entirety as API functions | ||
sycl::queue q(dpct::get_default_queue()); | ||
oneapi::dpl::counting_iterator<int> count_it(0); | ||
sycl::buffer<int, 1> buffer(count_it, count_it + 512); | ||
int data_out[512]; | ||
for (int i = 0; i < 512; i++) data_out[i] = 0; | ||
sycl::buffer<int, 1> buffer_out(data_out, 512); | ||
|
||
q.submit([&](sycl::handler &h) { | ||
using group_load = dpct::group::workgroup_load<4, T, int, const int *, sycl::nd_item<3>>; | ||
size_t temp_storage_size = group_load::get_local_memory_size(128); | ||
sycl::local_accessor<uint8_t, 1> tacc(sycl::range<1>(temp_storage_size), h); | ||
sycl::accessor data_accessor_read(buffer, h, sycl::read_only); | ||
sycl::accessor data_accessor_write(buffer_out, h, sycl::write_only); | ||
h.parallel_for( | ||
sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)), | ||
[=](sycl::nd_item<3> item) { | ||
int thread_data[4]; | ||
auto *d_r = data_accessor_read.get_multi_ptr<sycl::access::decorated::yes>().get(); | ||
auto *tmp = tacc.get_multi_ptr<sycl::access::decorated::yes>().get(); | ||
group_load(tmp).load(item, d_r, thread_data); | ||
// Write thread_data of each work item at index to the global buffer | ||
int global_index = item.get_group(2)*item.get_local_range().get(2) + item.get_local_id(2); // Each thread_data has 4 elements | ||
#pragma unroll | ||
for (int i = 0; i < 4; ++i) { | ||
data_accessor_write[global_index * 4 + i] = thread_data[i]; | ||
} | ||
}); | ||
}); | ||
q.wait_and_throw(); | ||
|
||
sycl::host_accessor data_accessor(buffer_out, sycl::read_only); | ||
const int *ptr = data_accessor.get_multi_ptr<sycl::access::decorated::yes>(); | ||
return helper_validation_function<T>(ptr, "test_group_load"); | ||
} | ||
|
||
bool test_load_subgroup_striped_standalone() { | ||
// Tests dpct::group::load_subgroup_striped as standalone method | ||
sycl::queue q(dpct::get_default_queue()); | ||
int data[512]; | ||
for (int i = 0; i < 512; i++) data[i] = i; | ||
sycl::buffer<int, 1> buffer(data, 512); | ||
sycl::buffer<uint32_t, 1> sg_sz_buf{sycl::range<1>(1)}; | ||
int data_out[512]; | ||
for (int i = 0; i < 512; i++) data_out[i] = 0; | ||
sycl::buffer<int, 1> buffer_out(data_out, 512); | ||
|
||
q.submit([&](sycl::handler &h) { | ||
sycl::accessor dacc_read(buffer, h, sycl::read_only); | ||
sycl::accessor dacc_write(buffer_out, h, sycl::write_only); | ||
sycl::accessor sg_sz_dacc(sg_sz_buf, h, sycl::read_write); | ||
h.parallel_for( | ||
sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)), | ||
[=](sycl::nd_item<3> item) { | ||
int thread_data[4]; | ||
auto *d_r = dacc_read.get_multi_ptr<sycl::access::decorated::yes>().get(); | ||
auto *sg_sz_acc = sg_sz_dacc.get_multi_ptr<sycl::access::decorated::yes>().get(); | ||
size_t gid = item.get_global_linear_id(); | ||
if (gid == 0) { | ||
sg_sz_acc[0] = item.get_sub_group().get_local_linear_range(); | ||
} | ||
dpct::group::uninitialized_load_subgroup_striped<4, int>(item, d_r, thread_data); | ||
// Write thread_data of each work item at index to the global buffer | ||
int global_index = (item.get_group(2)*item.get_local_range().get(2)) + item.get_local_id(2); // Each thread_data has 4 elements | ||
#pragma unroll | ||
for (int i = 0; i < 4; ++i) { | ||
dacc_write[global_index * 4 + i] = thread_data[i]; | ||
} | ||
}); | ||
}); | ||
q.wait_and_throw(); | ||
|
||
sycl::host_accessor data_accessor(buffer_out, sycl::read_only); | ||
const int *ptr = data_accessor.get_multi_ptr<sycl::access::decorated::yes>(); | ||
sycl::host_accessor data_accessor_sg(sg_sz_buf, sycl::read_only); | ||
const uint32_t *ptr_sg = data_accessor_sg.get_multi_ptr<sycl::access::decorated::yes>(); | ||
return subgroup_helper_validation_function(ptr, ptr_sg, "test_subgroup_striped_standalone"); | ||
} | ||
|
||
template<dpct::group::load_algorithm T> | ||
bool test_group_load_standalone() { | ||
// Tests dpct::group::load_algorithm::BLOCK_LOAD_DIRECT & dpct::group::load_algorithm::BLOCK_LOAD_STRIPED | ||
// as standalone methods | ||
sycl::queue q(dpct::get_default_queue()); | ||
int data[512]; | ||
for (int i = 0; i < 512; i++) data[i] = i; | ||
sycl::buffer<int, 1> buffer(data, 512); | ||
int data_out[512]; | ||
for (int i = 0; i < 512; i++) data_out[i] = 0; | ||
sycl::buffer<int, 1> buffer_out(data_out, 512); | ||
|
||
q.submit([&](sycl::handler &h) { | ||
sycl::accessor dacc_read(buffer, h, sycl::read_only); | ||
sycl::accessor dacc_write(buffer_out, h, sycl::write_only); | ||
h.parallel_for( | ||
sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)), | ||
[=](sycl::nd_item<3> item) { | ||
int thread_data[4]; | ||
auto *d_r = dacc_read.get_multi_ptr<sycl::access::decorated::yes>().get(); | ||
if( T == dpct::group::load_algorithm::BLOCK_LOAD_DIRECT) | ||
{dpct::group::load_blocked<4, int>(item, d_r, thread_data);} | ||
else | ||
{dpct::group::load_striped<4, int>(item, d_r, thread_data);} | ||
// Write thread_data of each work item at index to the global buffer | ||
int global_index = item.get_group(2)*item.get_local_range().get(2) + item.get_local_id(2); // Each thread_data has 4 elements | ||
#pragma unroll | ||
for (int i = 0; i < 4; ++i) { | ||
dacc_write[global_index * 4 + i] = thread_data[i]; | ||
} | ||
}); | ||
}); | ||
q.wait_and_throw(); | ||
|
||
sycl::host_accessor data_accessor(buffer_out, sycl::read_only); | ||
const int *ptr = data_accessor.get_multi_ptr<sycl::access::decorated::yes>(); | ||
return helper_validation_function<T>(ptr, "test_group_load"); | ||
} | ||
|
||
int main() { | ||
|
||
return !( | ||
// Calls test_group_load with blocked and striped strategies , should pass both results. | ||
test_group_load<dpct::group::load_algorithm::BLOCK_LOAD_DIRECT>() && test_group_load<dpct::group::load_algorithm::BLOCK_LOAD_STRIPED>() && | ||
// Calls test_load_subgroup_striped_standalone and should pass | ||
test_load_subgroup_striped_standalone() && | ||
// Calls test_group_load_standalone with blocked and striped strategies as free functions, should pass both results. | ||
test_group_load_standalone<dpct::group::load_algorithm::BLOCK_LOAD_STRIPED>() && test_group_load_standalone<dpct::group::load_algorithm::BLOCK_LOAD_DIRECT>()); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The test name should be
onedpl_test_group_load
.