1010
1111#include < string>
1212
13+ template <int Dim, typename T> struct KernelFunctor {
14+ int *mCorrectResultFlag ;
15+ T mClusterLaunchProperty ;
16+ sycl::range<Dim> mClusterRange ;
17+ KernelFunctor (int *CorrectResultFlag, T ClusterLaunchProperty,
18+ sycl::range<Dim> ClusterRange)
19+ : mCorrectResultFlag (CorrectResultFlag),
20+ mClusterLaunchProperty (ClusterLaunchProperty),
21+ mClusterRange(ClusterRange) {}
22+
23+ void operator ()(sycl::nd_item<Dim> It) const {
24+ uint32_t ClusterDimX, ClusterDimY, ClusterDimZ;
25+ // Temporary solution till cluster group class is implemented
26+ #if defined(__SYCL_DEVICE_ONLY__) && defined(__SYCL_CUDA_ARCH__) && \
27+ (__SYCL_CUDA_ARCH__ >= 900 )
28+ asm volatile (" \n\t "
29+ " mov.u32 %0, %%cluster_nctaid.x; \n\t "
30+ " mov.u32 %1, %%cluster_nctaid.y; \n\t "
31+ " mov.u32 %2, %%cluster_nctaid.z; \n\t "
32+ : " =r" (ClusterDimZ), " =r" (ClusterDimY), " =r" (ClusterDimX));
33+ #endif
34+ if constexpr (Dim == 1 ) {
35+ if (ClusterDimZ == mClusterRange [0 ] && ClusterDimY == 1 &&
36+ ClusterDimX == 1 ) {
37+ *mCorrectResultFlag = 1 ;
38+ }
39+ } else if constexpr (Dim == 2 ) {
40+ if (ClusterDimZ == mClusterRange [1 ] && ClusterDimY == mClusterRange [0 ] &&
41+ ClusterDimX == 1 ) {
42+ *mCorrectResultFlag = 1 ;
43+ }
44+ } else {
45+ if (ClusterDimZ == mClusterRange [2 ] && ClusterDimY == mClusterRange [1 ] &&
46+ ClusterDimX == mClusterRange [0 ]) {
47+ *mCorrectResultFlag = 1 ;
48+ }
49+ }
50+ }
51+ auto get (sycl::ext::oneapi::experimental::properties_tag) const {
52+ return mClusterLaunchProperty ;
53+ }
54+ };
55+
1356template <int Dim>
1457int test_cluster_launch_parallel_for (sycl::queue &Queue,
1558 sycl::range<Dim> GlobalRange,
@@ -25,38 +68,10 @@ int test_cluster_launch_parallel_for(sycl::queue &Queue,
2568
2669 Queue
2770 .submit ([&](sycl::handler &CGH) {
28- CGH.parallel_for (sycl::nd_range<Dim>(GlobalRange, LocalRange),
29- ClusterLaunchProperty, [=](sycl::nd_item<Dim> It) {
30- uint32_t ClusterDimX, ClusterDimY, ClusterDimZ;
31- // Temporary solution till cluster group class is implemented
32- #if defined(__SYCL_DEVICE_ONLY__) && defined(__SYCL_CUDA_ARCH__) && \
33- (__SYCL_CUDA_ARCH__ >= 900 )
34- asm volatile (" \n\t "
35- " mov.u32 %0, %%cluster_nctaid.x; \n\t "
36- " mov.u32 %1, %%cluster_nctaid.y; \n\t "
37- " mov.u32 %2, %%cluster_nctaid.z; \n\t "
38- : " =r" (ClusterDimZ), " =r" (ClusterDimY),
39- " =r" (ClusterDimX));
40- #endif
41- if constexpr (Dim == 1 ) {
42- if (ClusterDimZ == ClusterRange[0 ] &&
43- ClusterDimY == 1 && ClusterDimX == 1 ) {
44- *CorrectResultFlag = 1 ;
45- }
46- } else if constexpr (Dim == 2 ) {
47- if (ClusterDimZ == ClusterRange[1 ] &&
48- ClusterDimY == ClusterRange[0 ] &&
49- ClusterDimX == 1 ) {
50- *CorrectResultFlag = 1 ;
51- }
52- } else {
53- if (ClusterDimZ == ClusterRange[2 ] &&
54- ClusterDimY == ClusterRange[1 ] &&
55- ClusterDimX == ClusterRange[0 ]) {
56- *CorrectResultFlag = 1 ;
57- }
58- }
59- });
71+ CGH.parallel_for (
72+ sycl::nd_range<Dim>(GlobalRange, LocalRange),
73+ KernelFunctor<Dim, decltype (ClusterLaunchProperty)>(
74+ CorrectResultFlag, ClusterLaunchProperty, ClusterRange));
6075 })
6176 .wait_and_throw ();
6277
0 commit comments