22// XFAIL: (opencl && !cpu && !accelerator)
33// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14641
44
5- // RUN: %{build} -I . -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70 %}
5+ // TODO: Currently using the -Wno-deprecated-declarations flag due to issue
6+ // https://github.com/intel/llvm/issues/16451. Rewrite testRootGroup() amd
7+ // remove the flag once the issue is resolved.
8+ // RUN: %{build} -I . -o %t.out -Wno-deprecated-declarations %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70 %}
69// RUN: %{run} %t.out
710
811// Disabled temporarily while investigation into the failure is ongoing.
@@ -60,6 +63,34 @@ void testQueriesAndProperties() {
6063 check_max_num_work_group_sync (maxWGsWithLimits);
6164}
6265
66+ template <typename T> struct TestKernel1 {
67+ T m_data;
68+ TestKernel1 (T &data_) : m_data(data_) {}
69+ void operator ()(sycl::nd_item<1 > it) const {
70+ volatile float X = 1 .0f ;
71+ volatile float Y = 1 .0f ;
72+ auto root = it.ext_oneapi_get_root_group ();
73+ m_data[root.get_local_id ()] = root.get_local_id ();
74+ sycl::group_barrier (root);
75+ // Delay half of the workgroups with extra work to check that the barrier
76+ // synchronizes the whole device.
77+ if (it.get_group (0 ) % 2 == 0 ) {
78+ X += sycl::sin (X);
79+ Y += sycl::cos (Y);
80+ }
81+ root = sycl::ext::oneapi::experimental::this_work_item::get_root_group<1 >();
82+ int sum = m_data[root.get_local_id ()] +
83+ m_data[root.get_local_range () - root.get_local_id () - 1 ];
84+ sycl::group_barrier (root);
85+ m_data[root.get_local_id ()] = sum;
86+ }
87+ auto get (sycl::ext::oneapi::experimental::properties_tag) const {
88+ return sycl::ext::oneapi::experimental::properties{
89+ sycl::ext::oneapi::experimental::use_root_sync};
90+ ;
91+ }
92+ };
93+
6394void testRootGroup () {
6495 sycl::queue q;
6596 const auto bundle =
@@ -70,41 +101,13 @@ void testRootGroup() {
70101 .ext_oneapi_get_info <sycl::ext::oneapi::experimental::info::
71102 kernel_queue_specific::max_num_work_groups>(
72103 q, WorkGroupSize, 0 );
104+ const auto props = sycl::ext::oneapi::experimental::properties{
105+ sycl::ext::oneapi::experimental::use_root_sync};
73106 sycl::buffer<int > dataBuf{sycl::range{maxWGs * WorkGroupSize}};
74107 const auto range = sycl::nd_range<1 >{maxWGs * WorkGroupSize, WorkGroupSize};
75- struct TestKernel1 {
76- sycl::buffer<int > *m_dataBuf;
77- sycl::handler *m_h;
78- TestKernel1 (sycl::buffer<int > *dataBuf, sycl::handler *h)
79- : m_dataBuf(dataBuf), m_h(h) {}
80- void operator ()(sycl::nd_item<1 > it) const {
81- sycl::accessor data{*m_dataBuf, *m_h};
82- volatile float X = 1 .0f ;
83- volatile float Y = 1 .0f ;
84- auto root = it.ext_oneapi_get_root_group ();
85- data[root.get_local_id ()] = root.get_local_id ();
86- sycl::group_barrier (root);
87- // Delay half of the workgroups with extra work to check that the barrier
88- // synchronizes the whole device.
89- if (it.get_group (0 ) % 2 == 0 ) {
90- X += sycl::sin (X);
91- Y += sycl::cos (Y);
92- }
93- root =
94- sycl::ext::oneapi::experimental::this_work_item::get_root_group<1 >();
95- int sum = data[root.get_local_id ()] +
96- data[root.get_local_range () - root.get_local_id () - 1 ];
97- sycl::group_barrier (root);
98- data[root.get_local_id ()] = sum;
99- }
100- auto get (sycl::ext::oneapi::experimental::properties_tag) const {
101- return sycl::ext::oneapi::experimental::properties{
102- sycl::ext::oneapi::experimental::use_root_sync};
103- ;
104- }
105- };
106108 q.submit ([&](sycl::handler &h) {
107- h.parallel_for <class RootGroupKernel >(range, TestKernel1 (&dataBuf, &h));
109+ sycl::accessor data{dataBuf, h};
110+ h.parallel_for <class RootGroupKernel >(range, TestKernel1 (data));
108111 });
109112 sycl::host_accessor data{dataBuf};
110113 const int workItemCount = static_cast <int >(range.get_global_range ().size ());
0 commit comments