Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
c08ae48
block load tests
abhilash1910 Feb 1, 2024
513b103
update test
abhilash1910 Apr 3, 2024
d8b7cb7
use onedpl iterator
abhilash1910 Apr 4, 2024
be5eefe
rm duplicate function & iterator
abhilash1910 Apr 5, 2024
66ec770
Update help_function/src/onedpl_test_group_load.cpp
abhilash1910 Apr 9, 2024
146c0d1
Update help_function/src/onedpl_test_group_load.cpp
abhilash1910 Apr 9, 2024
ec5d9a8
Update help_function/src/onedpl_test_group_load.cpp
abhilash1910 Apr 9, 2024
400f48d
rm unwanted test
abhilash1910 Apr 11, 2024
5878cb7
update review comments
abhilash1910 Apr 11, 2024
022e628
Merge branch 'SYCLomatic' into block_load_test
abhilash1910 Apr 11, 2024
ae0e1d6
rebase from upstream
abhilash1910 Apr 11, 2024
d2ae0ee
update tests
abhilash1910 Apr 29, 2024
23b7023
update template param
abhilash1910 Apr 30, 2024
7c2ff7e
fix compile issues
abhilash1910 Apr 30, 2024
fe51693
update & fix all tests
abhilash1910 Apr 30, 2024
c5447c0
revert accidental deletion
abhilash1910 Apr 30, 2024
d99a828
replace striped expected output
abhilash1910 May 1, 2024
5ddb779
update striped test case
abhilash1910 May 2, 2024
ab0277a
use subgroup size
abhilash1910 May 2, 2024
300ac08
re-add to prev line
abhilash1910 May 5, 2024
9c21a36
review commits
abhilash1910 May 6, 2024
7cd159c
refactor tests
abhilash1910 May 6, 2024
2b54c4c
Merge branch 'SYCLomatic' into block_load_test
abhilash1910 May 6, 2024
24e80cc
update test
abhilash1910 May 6, 2024
0f4c9ee
fix return no code exec
abhilash1910 May 6, 2024
890d4ff
fix compile issues and pass test
abhilash1910 May 7, 2024
6ec45aa
add comments
abhilash1910 May 7, 2024
3a5c119
apply barrier for sync and update test case
abhilash1910 May 8, 2024
8b42494
review comments
abhilash1910 May 9, 2024
e79a7fc
update
abhilash1910 May 9, 2024
68618bc
rename file
abhilash1910 May 9, 2024
1905ed9
remove barrier and use output buffer
abhilash1910 May 9, 2024
cf9fdd9
rm unused variables
abhilash1910 May 9, 2024
31382c1
fix test case
abhilash1910 May 9, 2024
5a5881b
add local changes and run tests
abhilash1910 May 9, 2024
7f6ed89
format changes
abhilash1910 May 10, 2024
92650b8
revert rename
abhilash1910 May 10, 2024
1a9d205
clang format
abhilash1910 May 10, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions help_function/help_function.xml
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@
<!-- <test testName="onedpl_test_transform" configFile="config/TEMPLATE_help_function_skip_cuda_backend.xml" /> -->
<test testName="onedpl_test_transform_output_iterator" configFile="config/TEMPLATE_help_function_skip_cuda_backend.xml" />
<test testName="onedpl_test_group_sort" configFile="config/TEMPLATE_help_function.xml" />
<test testName="onedpl_test_group_load" configFile="config/TEMPLATE_help_function.xml" />
<test testName="onedpl_test_transform_reduce" configFile="config/TEMPLATE_help_function_skip_double.xml" splitGroup="double" />
<test testName="onedpl_test_translate_key" configFile="config/TEMPLATE_help_function_skip_double.xml" splitGroup="double" />
<test testName="onedpl_test_uninitialized_fill" configFile="config/TEMPLATE_help_function_skip_cuda_backend.xml" />
Expand Down
239 changes: 239 additions & 0 deletions help_function/src/onedpl_test_group_load.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,239 @@
// ====------ onedpl_test_group_load.cpp------------ *- C++ -* ----===//

//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//
// ===----------------------------------------------------------------------===//
#include <dpct/dpct.hpp>
#include <dpct/dpl_utils.hpp>
#include <iostream>
#include <oneapi/dpl/iterator>
#include <sycl/sycl.hpp>

template <dpct::group::load_algorithm T>
bool helper_validation_function(const int *ptr, const char *func_name) {
if constexpr (T == dpct::group::load_algorithm::BLOCK_LOAD_DIRECT) {
for (int i = 0; i < 512; ++i) {
if (ptr[i] != i) {
std::cout << func_name << "_blocked"
<< " failed\n";
std::ostream_iterator<int> Iter(std::cout, ", ");
std::copy(ptr, ptr + 512, Iter);
std::cout << std::endl;
return false;
}
}
std::cout << func_name << "_blocked"
<< " pass\n";
} else {
int expected[512];
int num_threads = 128;
int items_per_thread = 4;
for (int i = 0; i < num_threads; ++i) {
for (int j = 0; j < items_per_thread; ++j) {
expected[i * items_per_thread + j] = j * num_threads + i;
}
}
for (int i = 0; i < 512; ++i) {
if (ptr[i] != expected[i]) {
std::cout << func_name << "_striped"
<< " failed\n";
std::ostream_iterator<int> Iter(std::cout, ", ");
std::copy(ptr, ptr + 512, Iter);
std::cout << std::endl;
return false;
}
}
std::cout << func_name << "_striped"
<< " pass\n";
}
return true;
}

bool subgroup_helper_validation_function(const int *ptr, const uint32_t *sg_sz,
const char *func_name) {
int expected[512];
int num_threads = 128;
int items_per_thread = 4;
uint32_t sg_sz_val = *sg_sz;
for (int i = 0; i < num_threads; ++i) {
for (int j = 0; j < items_per_thread; ++j) {
expected[items_per_thread * i + j] =
(i / sg_sz_val) * sg_sz_val * items_per_thread + sg_sz_val * j +
i % sg_sz_val;
}
}
for (int i = 0; i < 512; ++i) {
if (ptr[i] != expected[i]) {
std::cout << " failed\n";
std::ostream_iterator<int> Iter(std::cout, ", ");
std::copy(ptr, ptr + 512, Iter);
std::cout << std::endl;
return false;
}
}

std::cout << func_name << " pass\n";
return true;
}

template <dpct::group::load_algorithm T> bool test_group_load() {
// Tests dpct::group::load_algorithm::BLOCK_LOAD_DIRECT &
// dpct::group::load_algorithm::BLOCK_LOAD_STRIPED in its entirety as API
// functions
sycl::queue q(dpct::get_default_queue());
oneapi::dpl::counting_iterator<int> count_it(0);
sycl::buffer<int, 1> buffer(count_it, count_it + 512);
int data_out[512];
for (int i = 0; i < 512; i++)
data_out[i] = 0;
sycl::buffer<int, 1> buffer_out(data_out, 512);

q.submit([&](sycl::handler &h) {
using group_load =
dpct::group::workgroup_load<4, T, int, const int *, sycl::nd_item<3>>;
size_t temp_storage_size = group_load::get_local_memory_size(128);
sycl::local_accessor<uint8_t, 1> tacc(sycl::range<1>(temp_storage_size), h);
sycl::accessor data_accessor_read(buffer, h, sycl::read_only);
sycl::accessor data_accessor_write(buffer_out, h, sycl::write_only);
h.parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
[=](sycl::nd_item<3> item) {
int thread_data[4];
auto *d_r =
data_accessor_read.get_multi_ptr<sycl::access::decorated::yes>()
.get();
auto *tmp = tacc.get_multi_ptr<sycl::access::decorated::yes>().get();
group_load(tmp).load(item, d_r, thread_data);
// Write thread_data of each work item at index to the global buffer
int global_index =
item.get_group(2) * item.get_local_range().get(2) +
item.get_local_id(2); // Each thread_data has 4 elements
#pragma unroll
for (int i = 0; i < 4; ++i) {
data_accessor_write[global_index * 4 + i] = thread_data[i];
}
});
});
q.wait_and_throw();

sycl::host_accessor data_accessor(buffer_out, sycl::read_only);
const int *ptr = data_accessor.get_multi_ptr<sycl::access::decorated::yes>();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not a big deal from my side, but host_accessor overloads operator[], so we don't really need a pointer.

return helper_validation_function<T>(ptr, "test_group_load");
}

bool test_load_subgroup_striped_standalone() {
// Tests dpct::group::load_subgroup_striped as standalone method
sycl::queue q(dpct::get_default_queue());
int data[512];
for (int i = 0; i < 512; i++)
data[i] = i;
sycl::buffer<int, 1> buffer(data, 512);
sycl::buffer<uint32_t, 1> sg_sz_buf{sycl::range<1>(1)};
int data_out[512];
for (int i = 0; i < 512; i++)
data_out[i] = 0;
sycl::buffer<int, 1> buffer_out(data_out, 512);

q.submit([&](sycl::handler &h) {
sycl::accessor dacc_read(buffer, h, sycl::read_only);
sycl::accessor dacc_write(buffer_out, h, sycl::write_only);
sycl::accessor sg_sz_dacc(sg_sz_buf, h, sycl::read_write);
h.parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
[=](sycl::nd_item<3> item) {
int thread_data[4];
auto *d_r =
dacc_read.get_multi_ptr<sycl::access::decorated::yes>().get();
auto *sg_sz_acc =
sg_sz_dacc.get_multi_ptr<sycl::access::decorated::yes>().get();
size_t gid = item.get_global_linear_id();
if (gid == 0) {
sg_sz_acc[0] = item.get_sub_group().get_local_linear_range();
}
dpct::group::uninitialized_load_subgroup_striped<4, int>(item, d_r,
thread_data);
// Write thread_data of each work item at index to the global buffer
int global_index =
(item.get_group(2) * item.get_local_range().get(2)) +
item.get_local_id(2); // Each thread_data has 4 elements
#pragma unroll
for (int i = 0; i < 4; ++i) {
dacc_write[global_index * 4 + i] = thread_data[i];
}
});
});
q.wait_and_throw();

sycl::host_accessor data_accessor(buffer_out, sycl::read_only);
const int *ptr = data_accessor.get_multi_ptr<sycl::access::decorated::yes>();
sycl::host_accessor data_accessor_sg(sg_sz_buf, sycl::read_only);
const uint32_t *ptr_sg =
data_accessor_sg.get_multi_ptr<sycl::access::decorated::yes>();
return subgroup_helper_validation_function(
ptr, ptr_sg, "test_subgroup_striped_standalone");
}

template <dpct::group::load_algorithm T> bool test_group_load_standalone() {
// Tests dpct::group::load_algorithm::BLOCK_LOAD_DIRECT &
// dpct::group::load_algorithm::BLOCK_LOAD_STRIPED as standalone methods
sycl::queue q(dpct::get_default_queue());
int data[512];
for (int i = 0; i < 512; i++)
data[i] = i;
sycl::buffer<int, 1> buffer(data, 512);
int data_out[512];
for (int i = 0; i < 512; i++)
data_out[i] = 0;
sycl::buffer<int, 1> buffer_out(data_out, 512);

q.submit([&](sycl::handler &h) {
sycl::accessor dacc_read(buffer, h, sycl::read_only);
sycl::accessor dacc_write(buffer_out, h, sycl::write_only);
h.parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
[=](sycl::nd_item<3> item) {
int thread_data[4];
auto *d_r =
dacc_read.get_multi_ptr<sycl::access::decorated::yes>().get();
if (T == dpct::group::load_algorithm::BLOCK_LOAD_DIRECT) {
dpct::group::load_blocked<4, int>(item, d_r, thread_data);
} else {
dpct::group::load_striped<4, int>(item, d_r, thread_data);
}
// Write thread_data of each work item at index to the global buffer
int global_index =
item.get_group(2) * item.get_local_range().get(2) +
item.get_local_id(2); // Each thread_data has 4 elements
#pragma unroll
for (int i = 0; i < 4; ++i) {
dacc_write[global_index * 4 + i] = thread_data[i];
}
});
});
q.wait_and_throw();

sycl::host_accessor data_accessor(buffer_out, sycl::read_only);
const int *ptr = data_accessor.get_multi_ptr<sycl::access::decorated::yes>();
return helper_validation_function<T>(ptr, "test_group_load");
}

int main() {

return !(
// Calls test_group_load with blocked and striped strategies , should pass
// both results.
test_group_load<dpct::group::load_algorithm::BLOCK_LOAD_DIRECT>() &&
test_group_load<dpct::group::load_algorithm::BLOCK_LOAD_STRIPED>() &&
// Calls test_load_subgroup_striped_standalone and should pass
test_load_subgroup_striped_standalone() &&
// Calls test_group_load_standalone with blocked and striped strategies as
// free functions, should pass both results.
test_group_load_standalone<
dpct::group::load_algorithm::BLOCK_LOAD_STRIPED>() &&
test_group_load_standalone<
dpct::group::load_algorithm::BLOCK_LOAD_DIRECT>());
}