@@ -18,7 +18,7 @@ struct and
18
18
_NBL_STATIC_INLINE_CONSTEXPR T IdentityElement = ~0ull ; // this should be a reinterpret cast
19
19
20
20
// Functor call: returns the bitwise AND of the two operands.
inline T operator ()(T left, T right) { return left & right; }
21
-
21
+ _NBL_STATIC_INLINE_CONSTEXPR bool runOPonFirst = false ;
22
22
_NBL_STATIC_INLINE_CONSTEXPR const char * name = " and" ;
23
23
};
24
24
template <typename T>
@@ -28,7 +28,7 @@ struct xor
28
28
_NBL_STATIC_INLINE_CONSTEXPR T IdentityElement = 0ull ; // this should be a reinterpret cast
29
29
30
30
// Functor call: returns the bitwise XOR of the two operands.
inline T operator ()(T left, T right) { return left ^ right; }
31
-
31
+ _NBL_STATIC_INLINE_CONSTEXPR bool runOPonFirst = false ;
32
32
_NBL_STATIC_INLINE_CONSTEXPR const char * name = " xor" ;
33
33
};
34
34
template <typename T>
@@ -38,7 +38,7 @@ struct or
38
38
_NBL_STATIC_INLINE_CONSTEXPR T IdentityElement = 0ull ; // this should be a reinterpret cast
39
39
40
40
// Functor call: returns the bitwise OR of the two operands.
inline T operator ()(T left, T right) { return left | right; }
41
-
41
+ _NBL_STATIC_INLINE_CONSTEXPR bool runOPonFirst = false ;
42
42
_NBL_STATIC_INLINE_CONSTEXPR const char * name = " or" ;
43
43
};
44
44
template <typename T>
@@ -48,7 +48,7 @@ struct add
48
48
_NBL_STATIC_INLINE_CONSTEXPR T IdentityElement = T(0 );
49
49
50
50
// Functor call: returns the sum of the two operands.
inline T operator ()(T left, T right) { return left + right; }
51
-
51
+ _NBL_STATIC_INLINE_CONSTEXPR bool runOPonFirst = false ;
52
52
_NBL_STATIC_INLINE_CONSTEXPR const char * name = " add" ;
53
53
};
54
54
template <typename T>
@@ -58,7 +58,7 @@ struct mul
58
58
_NBL_STATIC_INLINE_CONSTEXPR T IdentityElement = T(1 );
59
59
60
60
// Functor call: returns the product of the two operands.
inline T operator ()(T left, T right) { return left * right; }
61
-
61
+ _NBL_STATIC_INLINE_CONSTEXPR bool runOPonFirst = false ;
62
62
_NBL_STATIC_INLINE_CONSTEXPR const char * name = " mul" ;
63
63
};
64
64
template <typename T>
@@ -68,7 +68,7 @@ struct min
68
68
_NBL_STATIC_INLINE_CONSTEXPR T IdentityElement = std::numeric_limits<T>::max();
69
69
70
70
// Functor call: returns the smaller of the two operands, compared as T via std::min<T>.
inline T operator ()(T left, T right) { return std::min<T>(left, right); }
71
-
71
+ _NBL_STATIC_INLINE_CONSTEXPR bool runOPonFirst = false ;
72
72
_NBL_STATIC_INLINE_CONSTEXPR const char * name = " min" ;
73
73
};
74
74
template <typename T>
@@ -78,9 +78,11 @@ struct max
78
78
_NBL_STATIC_INLINE_CONSTEXPR T IdentityElement = std::numeric_limits<T>::lowest();
79
79
80
80
// Functor call: returns the larger of the two operands, compared as T via std::max<T>.
inline T operator ()(T left, T right) { return std::max<T>(left, right); }
81
-
81
+ _NBL_STATIC_INLINE_CONSTEXPR bool runOPonFirst = false ;
82
82
_NBL_STATIC_INLINE_CONSTEXPR const char * name = " max" ;
83
83
};
84
// ballot declares no members of its own: operator(), IdentityElement, runOPonFirst and
// name all come from add<T>. It exists as a distinct type so that validateResults can
// detect it with std::is_same_v and mask the input down to its lowest bit before summing.
// NOTE(review): because `name` is inherited, a failing ballot run is logged with add's
// name string rather than a ballot-specific one -- confirm whether that is intended.
+ template <typename T>
85
+ struct ballot : add<T> {};
84
86
85
87
86
88
// subgroup method emulations on the CPU, to verify the results of the GPU methods
@@ -111,7 +113,6 @@ struct emulatedSubgroupReduction : emulatedSubgroupCommon<emulatedSubgroupReduct
111
113
red = OP ()(red,subgroupData[i]);
112
114
std::fill (outSubgroupData,outSubgroupData+clampedSubgroupSize,red);
113
115
}
114
-
115
116
_NBL_STATIC_INLINE_CONSTEXPR const char * name = " subgroup reduction" ;
116
117
};
117
118
template <class OP >
@@ -125,7 +126,6 @@ struct emulatedSubgroupScanExclusive : emulatedSubgroupCommon<emulatedSubgroupSc
125
126
for (auto i=1u ; i<clampedSubgroupSize; i++)
126
127
outSubgroupData[i] = OP ()(outSubgroupData[i-1u ],subgroupData[i-1u ]);
127
128
}
128
-
129
129
_NBL_STATIC_INLINE_CONSTEXPR const char * name = " subgroup exclusive scan" ;
130
130
};
131
131
template <class OP >
@@ -139,7 +139,6 @@ struct emulatedSubgroupScanInclusive : emulatedSubgroupCommon<emulatedSubgroupSc
139
139
for (auto i=1u ; i<clampedSubgroupSize; i++)
140
140
outSubgroupData[i] = OP ()(outSubgroupData[i-1u ],subgroupData[i]);
141
141
}
142
-
143
142
_NBL_STATIC_INLINE_CONSTEXPR const char * name = " subgroup inclusive scan" ;
144
143
};
145
144
@@ -151,12 +150,11 @@ struct emulatedWorkgroupReduction
151
150
152
151
// CPU reference for a workgroup-wide reduction: folds workgroupData[0 .. workgroupSize)
// from left to right with OP and writes the single reduced value into each of the
// workgroupSize slots of outputData.
// workgroupData[0] is read unconditionally, so workgroupSize is expected to be >= 1.
// subgroupSize is not used by the body shown here.
inline void operator ()(type_t * outputData, const type_t * workgroupData, uint32_t workgroupSize, uint32_t subgroupSize)
153
152
{
154
- type_t red = workgroupData[0 ];
153
// The added line seeds the accumulator with OP()(0, workgroupData[0]) when
// OP::runOPonFirst is true; note the left operand is the literal 0, not OP::IdentityElement.
// NOTE(review): every OP shown in this diff sets runOPonFirst to false, so the first
// branch of the conditional is not taken by any of them.
+ type_t red = OP::runOPonFirst ? OP ()( 0 , workgroupData[ 0 ]) : workgroupData[0 ];
155
154
for (auto i=1u ; i<workgroupSize; i++)
156
155
red = OP ()(red,workgroupData[i]);
157
156
// Broadcast the reduced value to every output slot.
std::fill (outputData,outputData+workgroupSize,red);
158
157
}
159
-
160
158
_NBL_STATIC_INLINE_CONSTEXPR const char * name = " workgroup reduction" ;
161
159
};
162
160
template <class OP >
@@ -170,7 +168,6 @@ struct emulatedWorkgroupScanExclusive
170
168
for (auto i=1u ; i<workgroupSize; i++)
171
169
outputData[i] = OP ()(outputData[i-1u ],workgroupData[i-1u ]);
172
170
}
173
-
174
171
_NBL_STATIC_INLINE_CONSTEXPR const char * name = " workgroup exclusive scan" ;
175
172
};
176
173
template <class OP >
@@ -184,14 +181,14 @@ struct emulatedWorkgroupScanInclusive
184
181
for (auto i=1u ; i<workgroupSize; i++)
185
182
outputData[i] = OP ()(outputData[i-1u ],workgroupData[i]);
186
183
}
187
-
188
184
_NBL_STATIC_INLINE_CONSTEXPR const char * name = " workgroup inclusive scan" ;
189
185
};
190
186
191
187
192
188
#include " common.glsl"
193
189
// Size in bytes of BUFFER_DWORD_COUNT uint32_t elements; main passes it as the size of
// the input buffer and of every output buffer it creates.
// BUFFER_DWORD_COUNT is not defined in the visible lines -- presumably it comes from the
// common.glsl include above; confirm.
constexpr uint32_t kBufferSize = BUFFER_DWORD_COUNT*sizeof (uint32_t );
194
190
191
+
195
192
// returns true if result matches
196
193
template <template <class > class Arithmetic , template <class > class OP >
197
194
bool validateResults (video::IVideoDriver* driver, const uint32_t * inputData, const uint32_t workgroupSize, const uint32_t workgroupCount, video::IGPUBuffer* bufferToDownload)
@@ -228,18 +225,27 @@ bool validateResults(video::IVideoDriver* driver, const uint32_t* inputData, con
228
225
// now check if the data obtained has valid values
229
226
constexpr uint32_t subgroupSize = 4u ;
230
227
uint32_t * tmp = new uint32_t [workgroupSize];
228
+ uint32_t * ballotInput = new uint32_t [workgroupSize];
231
229
for (uint32_t workgroupID=0u ; success&&workgroupID<workgroupCount; workgroupID++)
232
230
{
233
231
const auto workgroupOffset = workgroupID*workgroupSize;
234
- Arithmetic<OP<uint32_t >>()(tmp,inputData+workgroupOffset,workgroupSize,subgroupSize);
232
+ if constexpr (std::is_same_v<OP<uint32_t >,ballot<uint32_t >>)
233
+ {
234
+ for (auto i=0u ; i<workgroupSize; i++)
235
+ ballotInput[i] = inputData[i+workgroupOffset]&0x1u ;
236
+ Arithmetic<OP<uint32_t >>()(tmp,ballotInput,workgroupSize,subgroupSize);
237
+ }
238
+ else
239
+ Arithmetic<OP<uint32_t >>()(tmp,inputData+workgroupOffset,workgroupSize,subgroupSize);
235
240
for (uint32_t localInvocationIndex=0u ; localInvocationIndex<workgroupSize; localInvocationIndex++)
236
241
if (tmp[localInvocationIndex]!=dataFromBuffer[workgroupOffset+localInvocationIndex])
237
242
{
238
- os::Printer::log (" Failed test #" + std::to_string (workgroupSize) + " (" + Arithmetic<OP<uint32_t >>::name + " ) (" + OP<uint32_t >::name + " )" , ELL_ERROR);
243
+ os::Printer::log (" Failed test #" + std::to_string (workgroupSize) + " (" + Arithmetic<OP<uint32_t >>::name + " ) (" + OP<uint32_t >::name + " ) Expected " + std::to_string (tmp[localInvocationIndex])+ " got " + std::to_string (dataFromBuffer[workgroupOffset + localInvocationIndex]) , ELL_ERROR);
239
244
success = false ;
240
245
break ;
241
246
}
242
247
}
248
+ delete[] ballotInput;
243
249
delete[] tmp;
244
250
}
245
251
else
@@ -250,7 +256,7 @@ bool validateResults(video::IVideoDriver* driver, const uint32_t* inputData, con
250
256
251
257
}
252
258
template <template <class > class Arithmetic >
253
- bool runTest (video::IVideoDriver* driver, video::IGPUComputePipeline* pipeline, const video::IGPUDescriptorSet* ds, const uint32_t * inputData, const uint32_t workgroupSize, core::smart_refctd_ptr<IGPUBuffer>* const buffers)
259
+ bool runTest (video::IVideoDriver* driver, video::IGPUComputePipeline* pipeline, const video::IGPUDescriptorSet* ds, const uint32_t * inputData, const uint32_t workgroupSize, core::smart_refctd_ptr<IGPUBuffer>* const buffers, bool is_workgroup_test = false )
254
260
{
255
261
driver->bindComputePipeline (pipeline);
256
262
driver->bindDescriptorSets (video::EPBP_COMPUTE,pipeline->getLayout (),0u ,1u ,&ds,nullptr );
@@ -265,6 +271,11 @@ bool runTest(video::IVideoDriver* driver, video::IGPUComputePipeline* pipeline,
265
271
passed = validateResults<Arithmetic,mul>(driver, inputData, workgroupSize, workgroupCount, buffers[4 ].get ())&&passed;
266
272
passed = validateResults<Arithmetic,::min>(driver, inputData, workgroupSize, workgroupCount, buffers[5 ].get ())&&passed;
267
273
passed = validateResults<Arithmetic,::max>(driver, inputData, workgroupSize, workgroupCount, buffers[6 ].get ())&&passed;
274
+ if (is_workgroup_test)
275
+ {
276
+ passed = validateResults<Arithmetic,ballot>(driver, inputData, workgroupSize, workgroupCount, buffers[7 ].get ()) && passed;
277
+ }
278
+
268
279
return passed;
269
280
}
270
281
@@ -300,43 +311,41 @@ int main()
300
311
}
301
312
auto gpuinputDataBuffer = driver->createFilledDeviceLocalGPUBufferOnDedMem (kBufferSize , inputData);
302
313
303
- // create 7 buffers.
304
- core::smart_refctd_ptr<IGPUBuffer> buffers[7 ];
305
- for (size_t i = 0 ; i < 7 ; i++)
314
+ // create 8 buffers.
315
+ constexpr const int outputBufferCount = 8 ;
316
+ constexpr const int totalBufferCount = outputBufferCount+1 ;
317
+
318
+ core::smart_refctd_ptr<IGPUBuffer> buffers[outputBufferCount];
319
+ for (size_t i = 0 ; i < outputBufferCount; i++)
306
320
{
307
321
buffers[i] = driver->createDeviceLocalGPUBufferOnDedMem (kBufferSize );
308
322
}
309
323
310
- IGPUDescriptorSetLayout::SBinding binding[8 ] = {
311
- {0u ,EDT_STORAGE_BUFFER,1u ,IGPUSpecializedShader::ESS_COMPUTE,nullptr }, // input with randomized numbers
312
- {1u ,EDT_STORAGE_BUFFER,1u ,IGPUSpecializedShader::ESS_COMPUTE,nullptr },
313
- {2u ,EDT_STORAGE_BUFFER,1u ,IGPUSpecializedShader::ESS_COMPUTE,nullptr },
314
- {3u ,EDT_STORAGE_BUFFER,1u ,IGPUSpecializedShader::ESS_COMPUTE,nullptr },
315
- {4u ,EDT_STORAGE_BUFFER,1u ,IGPUSpecializedShader::ESS_COMPUTE,nullptr },
316
- {5u ,EDT_STORAGE_BUFFER,1u ,IGPUSpecializedShader::ESS_COMPUTE,nullptr },
317
- {6u ,EDT_STORAGE_BUFFER,1u ,IGPUSpecializedShader::ESS_COMPUTE,nullptr },
318
- {7u ,EDT_STORAGE_BUFFER,1u ,IGPUSpecializedShader::ESS_COMPUTE,nullptr },
319
- };
320
- auto gpuDSLayout = driver->createGPUDescriptorSetLayout (binding, binding + 8 );
321
- constexpr uint32_t pushconstantSize = 64u ;
324
+ IGPUDescriptorSetLayout::SBinding binding[totalBufferCount];
325
+ for (uint32_t i = 0u ; i < totalBufferCount; i++)
326
+ {
327
+ binding[i] = { i,EDT_STORAGE_BUFFER,1u ,IGPUSpecializedShader::ESS_COMPUTE,nullptr };
328
+ }
329
+ auto gpuDSLayout = driver->createGPUDescriptorSetLayout (binding, binding + totalBufferCount);
330
+ constexpr uint32_t pushconstantSize = 8u * totalBufferCount;
322
331
SPushConstantRange pcRange[1 ] = { IGPUSpecializedShader::ESS_COMPUTE,0u ,pushconstantSize };
323
332
auto pipelineLayout = driver->createGPUPipelineLayout (pcRange, pcRange + pushconstantSize, core::smart_refctd_ptr (gpuDSLayout));
324
333
325
334
auto descriptorSet = driver->createGPUDescriptorSet (core::smart_refctd_ptr (gpuDSLayout));
326
335
{
327
- IGPUDescriptorSet::SDescriptorInfo infos[8 ];
336
+ IGPUDescriptorSet::SDescriptorInfo infos[totalBufferCount ];
328
337
infos[0 ].desc = gpuinputDataBuffer;
329
338
infos[0 ].buffer = { 0u ,kBufferSize };
330
- for (uint32_t i=1u ; i<=7u ; i++)
339
+ for (uint32_t i=1u ; i<= outputBufferCount ; i++)
331
340
{
332
341
infos[i].desc = buffers[i - 1 ];
333
342
infos[i].buffer = { 0u ,kBufferSize };
334
343
335
344
}
336
- IGPUDescriptorSet::SWriteDescriptorSet writes[8 ];
337
- for (uint32_t i=0u ; i<8u ; i++)
345
+ IGPUDescriptorSet::SWriteDescriptorSet writes[totalBufferCount ];
346
+ for (uint32_t i=0u ; i< totalBufferCount ; i++)
338
347
writes[i] = { descriptorSet.get (),i,0u ,1u ,EDT_STORAGE_BUFFER,infos + i };
339
- driver->updateDescriptorSets (8 , writes, 0u , nullptr );
348
+ driver->updateDescriptorSets (totalBufferCount , writes, 0u , nullptr );
340
349
}
341
350
struct GLSLCodeWithWorkgroup {
342
351
uint32_t workgroup_definition_position;
@@ -391,9 +400,9 @@ int main()
391
400
passed = runTest<emulatedSubgroupReduction>(driver,pipelines[0u ].get (),descriptorSet.get (),inputData,workgroupSize,buffers)&&passed;
392
401
passed = runTest<emulatedSubgroupScanExclusive>(driver,pipelines[1u ].get (),descriptorSet.get (),inputData,workgroupSize,buffers)&&passed;
393
402
passed = runTest<emulatedSubgroupScanInclusive>(driver,pipelines[2u ].get (),descriptorSet.get (),inputData,workgroupSize,buffers)&&passed;
394
- passed = runTest<emulatedWorkgroupReduction>(driver,pipelines[3u ].get (),descriptorSet.get (),inputData,workgroupSize,buffers)&&passed;
395
- passed = runTest<emulatedWorkgroupScanExclusive>(driver,pipelines[4u ].get (),descriptorSet.get (),inputData,workgroupSize,buffers)&&passed;
396
- passed = runTest<emulatedWorkgroupScanInclusive>(driver,pipelines[5u ].get (),descriptorSet.get (),inputData,workgroupSize,buffers)&&passed;
403
+ passed = runTest<emulatedWorkgroupReduction>(driver,pipelines[3u ].get (),descriptorSet.get (),inputData,workgroupSize,buffers, true )&&passed;
404
+ passed = runTest<emulatedWorkgroupScanExclusive>(driver,pipelines[4u ].get (),descriptorSet.get (),inputData,workgroupSize,buffers, true )&&passed;
405
+ passed = runTest<emulatedWorkgroupScanInclusive>(driver,pipelines[5u ].get (),descriptorSet.get (),inputData,workgroupSize,buffers, true )&&passed;
397
406
398
407
if (passed)
399
408
os::Printer::log (" Passed test #" + std::to_string (workgroupSize), ELL_INFORMATION);
0 commit comments