Skip to content

Commit 7316957

Browse files
committed
add the start of a test for cl_intel_concurrent_dispatch
1 parent b1a1675 commit 7316957

File tree

3 files changed

+243
-0
lines changed

3 files changed

+243
-0
lines changed
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Copyright (c) 2024 Ben Ashbaugh
2+
#
3+
# SPDX-License-Identifier: MIT
4+
5+
add_opencl_sample(
6+
TEST
7+
NUMBER 17
8+
TARGET concurrentdispatch
9+
VERSION 200 # for clSetKernelExecInfo
10+
SOURCES main.cpp
11+
LIBS OpenCLExt)
Lines changed: 231 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,231 @@
1+
/*
2+
// Copyright (c) 2024 Ben Ashbaugh
3+
//
4+
// SPDX-License-Identifier: MIT
5+
*/
6+
7+
#include <popl/popl.hpp>
8+
9+
#include <CL/opencl.hpp>
10+
11+
#include "util.hpp"
12+
13+
// TODO: clean this up once support is in the upstream headers.
14+
#if !defined(cl_intel_concurrent_dispatch)
15+
16+
#define cl_intel_concurrent_dispatch 1
17+
#define CL_INTEL_CONCURRENT_DISPATCH_EXTENSION_NAME \
18+
"cl_intel_concurrent_dispatch"
19+
20+
#define CL_INTEL_CONCURRENT_DISPATCH_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)
21+
22+
/* cl_kernel_exec_info */
23+
#define CL_KERNEL_EXEC_INFO_DISPATCH_TYPE_INTEL 0x4257
24+
25+
typedef cl_uint cl_kernel_exec_info_dispatch_type_intel;
26+
27+
/* cl_kernel_exec_info_dispatch_type_intel */
28+
#define CL_KERNEL_EXEC_INFO_DISPATCH_TYPE_DEFAULT_INTEL 0
29+
#define CL_KERNEL_EXEC_INFO_DISPATCH_TYPE_CONCURRENT_INTEL 1
30+
31+
typedef cl_int CL_API_CALL
32+
clGetKernelMaxConcurrentWorkGroupCountINTEL_t(
33+
cl_command_queue command_queue,
34+
cl_kernel kernel,
35+
cl_uint work_dim,
36+
const size_t* global_work_offset,
37+
const size_t* local_work_size,
38+
size_t* max_work_group_count);
39+
40+
typedef clGetKernelMaxConcurrentWorkGroupCountINTEL_t *
41+
clGetKernelMaxConcurrentWorkGroupCountINTEL_fn ;
42+
43+
#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)
44+
45+
extern CL_API_ENTRY cl_int CL_API_CALL
46+
clGetKernelMaxConcurrentWorkGroupCountINTEL(
47+
cl_command_queue command_queue,
48+
cl_kernel kernel,
49+
cl_uint work_dim,
50+
const size_t* global_work_offset,
51+
const size_t* local_work_size,
52+
size_t* max_work_group_count) ;
53+
54+
#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */
55+
56+
#endif // !defined(cl_intel_concurrent_dispatch)
57+
58+
static const char kernelString[] = R"CLC(
59+
#pragma OPENCL EXTENSION cl_intel_concurrent_dispatch : enable
60+
kernel void DeviceBarrierTest( global uint* dst )
61+
{
62+
const size_t gws = get_global_size(0);
63+
atomic_add( &dst[gws], 1 );
64+
65+
//if (intel_is_device_barrier_valid()) {
66+
//intel_device_barrier( CLK_LOCAL_MEM_FENCE ); // TODO: check fence flags
67+
//intel_device_barrier( CLK_LOCAL_MEM_FENCE, memory_scope_device ); // TODO: check fence flags
68+
//}
69+
70+
const uint id = get_global_id(0);
71+
dst[id] = dst[gws] + 1;
72+
}
73+
)CLC";
74+
75+
int main(int argc, char** argv)
76+
{
77+
int platformIndex = 0;
78+
int deviceIndex = 0;
79+
80+
size_t iterations = 16;
81+
size_t lws = 64;
82+
size_t wgCount = 0;
83+
84+
{
85+
popl::OptionParser op("Supported Options");
86+
op.add<popl::Value<int>>("p", "platform", "Platform Index", platformIndex, &platformIndex);
87+
op.add<popl::Value<int>>("d", "device", "Device Index", deviceIndex, &deviceIndex);
88+
op.add<popl::Value<size_t>>("i", "iterations", "Iterations", iterations, &iterations);
89+
op.add<popl::Value<size_t>>("", "lws", "Local Work-Group Size", lws, &lws);
90+
91+
bool printUsage = false;
92+
try {
93+
op.parse(argc, argv);
94+
} catch (std::exception& e) {
95+
fprintf(stderr, "Error: %s\n\n", e.what());
96+
printUsage = true;
97+
}
98+
99+
if (printUsage || !op.unknown_options().empty() || !op.non_option_args().empty()) {
100+
fprintf(stderr,
101+
"Usage: concurrentdispatch [options]\n"
102+
"%s", op.help().c_str());
103+
return -1;
104+
}
105+
}
106+
107+
std::vector<cl::Platform> platforms;
108+
cl::Platform::get(&platforms);
109+
110+
printf("Running on platform: %s\n",
111+
platforms[platformIndex].getInfo<CL_PLATFORM_NAME>().c_str() );
112+
113+
std::vector<cl::Device> devices;
114+
platforms[platformIndex].getDevices(CL_DEVICE_TYPE_ALL, &devices);
115+
116+
printf("Running on device: %s\n",
117+
devices[deviceIndex].getInfo<CL_DEVICE_NAME>().c_str() );
118+
119+
if (checkDeviceForExtension(devices[deviceIndex], CL_INTEL_CONCURRENT_DISPATCH_EXTENSION_NAME)) {
120+
printf("Device supports " CL_INTEL_CONCURRENT_DISPATCH_EXTENSION_NAME ".\n");
121+
} else {
122+
printf("Device does not support " CL_INTEL_CONCURRENT_DISPATCH_EXTENSION_NAME ".\n");
123+
return -1;
124+
}
125+
126+
cl::Context context{devices[deviceIndex]};
127+
cl::CommandQueue commandQueue{context, devices[deviceIndex]};
128+
129+
cl::Program program{ context, kernelString };
130+
program.build("-cl-std=CL3.0");
131+
cl::Kernel kernel = cl::Kernel{ program, "DeviceBarrierTest" };
132+
133+
cl_kernel_exec_info_dispatch_type_intel dispatchType =
134+
CL_KERNEL_EXEC_INFO_DISPATCH_TYPE_CONCURRENT_INTEL;
135+
kernel.setExecInfo(CL_KERNEL_EXEC_INFO_DISPATCH_TYPE_INTEL, dispatchType);
136+
137+
auto clGetKernelMaxConcurrentWorkGroupCountINTEL_ = (clGetKernelMaxConcurrentWorkGroupCountINTEL_fn)
138+
clGetExtensionFunctionAddressForPlatform(
139+
platforms[platformIndex](),
140+
"clGetKernelMaxConcurrentWorkGroupCountINTEL");
141+
clGetKernelMaxConcurrentWorkGroupCountINTEL_(
142+
commandQueue(),
143+
kernel(),
144+
1,
145+
nullptr,
146+
&lws,
147+
&wgCount);
148+
149+
printf("Max concurrent work-group count for local work size %zu is %zu.\n",
150+
lws, wgCount);
151+
152+
const size_t gws = lws * wgCount;
153+
154+
cl::Buffer dst = cl::Buffer{
155+
context,
156+
CL_MEM_READ_WRITE,
157+
(gws + 1) * sizeof(cl_uint) };
158+
159+
// execution
160+
{
161+
kernel.setArg(0, dst);
162+
163+
commandQueue.finish();
164+
165+
auto start = std::chrono::system_clock::now();
166+
for (size_t i = 0; i < iterations; i++) {
167+
cl_uint zero = 0;
168+
commandQueue.enqueueFillBuffer(
169+
dst,
170+
zero,
171+
0,
172+
(gws + 1) * sizeof(cl_uint));
173+
commandQueue.enqueueNDRangeKernel(
174+
kernel,
175+
cl::NullRange,
176+
cl::NDRange{gws},
177+
cl::NDRange{lws});
178+
}
179+
180+
commandQueue.finish();
181+
182+
auto end = std::chrono::system_clock::now();
183+
std::chrono::duration<float> elapsed_seconds = end - start;
184+
printf("Finished in %f seconds\n", elapsed_seconds.count());
185+
}
186+
187+
// verification
188+
{
189+
const cl_uint* pDst = (const cl_uint*)commandQueue.enqueueMapBuffer(
190+
dst,
191+
CL_TRUE,
192+
CL_MAP_READ,
193+
0,
194+
(gws + 1) * sizeof(cl_uint) );
195+
196+
size_t mismatches = 0;
197+
198+
for( size_t i = 0; i < gws + 1; i++ )
199+
{
200+
uint check = (i == gws) ? gws : gws + 1;
201+
if( pDst[i] != check )
202+
{
203+
if( mismatches < 16 )
204+
{
205+
fprintf(stderr, "MisMatch! dst[%zu] == %08X, want %08X\n",
206+
i,
207+
pDst[i],
208+
check );
209+
}
210+
mismatches++;
211+
}
212+
}
213+
214+
if( mismatches )
215+
{
216+
fprintf(stderr, "Error: Found %zu mismatches / %zu values!!!\n",
217+
mismatches,
218+
gws + 1 );
219+
}
220+
else
221+
{
222+
printf("Success.\n");
223+
}
224+
225+
commandQueue.enqueueUnmapMemObject(
226+
dst,
227+
(void*)pDst );
228+
}
229+
230+
return 0;
231+
}

samples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,4 +90,5 @@ if(BUILD_EXTENSION_SAMPLES)
9090
add_subdirectory( 13_mutablecommandbuffers )
9191
add_subdirectory( 14_ooqcommandbuffers )
9292
add_subdirectory( 15_mutablecommandbufferasserts )
93+
add_subdirectory( 17_concurrentdispatch )
9394
endif()

0 commit comments

Comments
 (0)