@@ -187,22 +187,14 @@ constexpr uint32_t kBufferSize = (1u+BUFFER_DWORD_COUNT)*sizeof(uint32_t);
187
187
188
188
// returns true if result matches
189
189
template <template <class > class Arithmetic , template <class > class OP >
190
- bool validateResults (ILogicalDevice* device, const uint32_t * inputData, const uint32_t workgroupSize, const uint32_t workgroupCount, video::IGPUBuffer* bufferToRead, system::ILogger* logger)
190
+ bool validateResults (ILogicalDevice* device, IUtilities* utilities, IGPUQueue* transferDownQueue, const uint32_t * inputData, const uint32_t workgroupSize, const uint32_t workgroupCount, video::IGPUBuffer* bufferToRead, asset::ICPUBuffer* resultsBuffer , system::ILogger* logger)
191
191
{
192
192
bool success = true ;
193
193
194
- auto mem = bufferToRead->getBoundMemory ();
195
- // (Erfan->Cyprian): see this old code below -> we don't have getMappingCaps function anymore because we can deduce it from memorytype's property flags
196
- // also since they are now `core::bitflag` you can use .hasFlags instead of using &
197
- // if (mem->getMappingCaps()&IDeviceMemoryAllocation::EMCF_COHERENT)
198
- // Also I added a ! because the if check should've been the other way around
199
- if (!mem->getMemoryPropertyFlags ().hasFlags (IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
200
- {
201
- IDeviceMemoryAllocation::MappedMemoryRange rng = {mem,0u ,kBufferSize };
202
- device->invalidateMappedMemoryRanges (1u ,&rng);
203
- }
194
+ SBufferRange<IGPUBuffer> bufferRange = {0u , kBufferSize , core::smart_refctd_ptr<IGPUBuffer>(bufferToRead)};
195
+ utilities->downloadBufferRangeViaStagingBuffer (transferDownQueue, bufferRange, resultsBuffer->getPointer ());
204
196
205
- auto dataFromBuffer = reinterpret_cast <uint32_t *>(mem-> getMappedPointer ());
197
+ auto dataFromBuffer = reinterpret_cast <uint32_t *>(resultsBuffer-> getPointer ());
206
198
const uint32_t subgroupSize = (*dataFromBuffer++);
207
199
208
200
// TODO: parallel for
@@ -242,7 +234,7 @@ constexpr const auto outputBufferCount = 8u;
242
234
243
235
template <template <class > class Arithmetic >
244
236
bool runTest (
245
- ILogicalDevice* device, IGPUQueue* queue, IGPUFence* reusableFence, IGPUCommandBuffer* cmdbuf, IGPUComputePipeline* pipeline, const IGPUDescriptorSet* ds,
237
+ ILogicalDevice* device, IUtilities* utilities, IGPUQueue* transferDownQueue, IGPUQueue* queue, IGPUFence* reusableFence, IGPUCommandBuffer* cmdbuf, IGPUComputePipeline* pipeline, const IGPUDescriptorSet* ds,
246
238
const uint32_t * inputData, const uint32_t workgroupSize, core::smart_refctd_ptr<IGPUBuffer>* const buffers, system::ILogger* logger, bool is_workgroup_test = false )
247
239
{
248
240
// TODO: overlap dispatches with memory readbacks (requires multiple copies of `buffers`)
@@ -275,18 +267,19 @@ bool runTest(
275
267
queue->submit (1u ,&submit,reusableFence);
276
268
device->blockForFences (1u ,&reusableFence);
277
269
device->resetFences (1u ,&reusableFence);
278
-
270
+
271
+ auto resultsBuffer = core::make_smart_refctd_ptr<ICPUBuffer>(kBufferSize );
279
272
// check results
280
- bool passed = validateResults<Arithmetic,and_op>(device, inputData, workgroupSize, workgroupCount, buffers[0 ].get (),logger);
281
- passed = validateResults<Arithmetic,xor_op>(device, inputData, workgroupSize, workgroupCount, buffers[1 ].get (),logger)&&passed;
282
- passed = validateResults<Arithmetic,or_op>(device, inputData, workgroupSize, workgroupCount, buffers[2 ].get (),logger)&&passed;
283
- passed = validateResults<Arithmetic,add_op>(device, inputData, workgroupSize, workgroupCount, buffers[3 ].get (),logger)&&passed;
284
- passed = validateResults<Arithmetic,mul_op>(device, inputData, workgroupSize, workgroupCount, buffers[4 ].get (),logger)&&passed;
285
- passed = validateResults<Arithmetic,min_op>(device, inputData, workgroupSize, workgroupCount, buffers[5 ].get (),logger)&&passed;
286
- passed = validateResults<Arithmetic,max_op>(device, inputData, workgroupSize, workgroupCount, buffers[6 ].get (),logger)&&passed;
273
+ bool passed = validateResults<Arithmetic,and_op>(device, utilities, transferDownQueue, inputData, workgroupSize, workgroupCount, buffers[0 ]. get (), resultsBuffer .get (),logger);
274
+ passed = validateResults<Arithmetic,xor_op>(device, utilities, transferDownQueue, inputData, workgroupSize, workgroupCount, buffers[1 ]. get (), resultsBuffer .get (),logger)&&passed;
275
+ passed = validateResults<Arithmetic,or_op>(device, utilities, transferDownQueue, inputData, workgroupSize, workgroupCount, buffers[2 ]. get (), resultsBuffer .get (),logger)&&passed;
276
+ passed = validateResults<Arithmetic,add_op>(device, utilities, transferDownQueue, inputData, workgroupSize, workgroupCount, buffers[3 ]. get (), resultsBuffer .get (),logger)&&passed;
277
+ passed = validateResults<Arithmetic,mul_op>(device, utilities, transferDownQueue, inputData, workgroupSize, workgroupCount, buffers[4 ]. get (), resultsBuffer .get (),logger)&&passed;
278
+ passed = validateResults<Arithmetic,min_op>(device, utilities, transferDownQueue, inputData, workgroupSize, workgroupCount, buffers[5 ]. get (), resultsBuffer .get (),logger)&&passed;
279
+ passed = validateResults<Arithmetic,max_op>(device, utilities, transferDownQueue, inputData, workgroupSize, workgroupCount, buffers[6 ]. get (), resultsBuffer .get (),logger)&&passed;
287
280
if (is_workgroup_test)
288
281
{
289
- passed = validateResults<Arithmetic,ballot>(device, inputData, workgroupSize, workgroupCount, buffers[7 ].get (),logger)&&passed;
282
+ passed = validateResults<Arithmetic,ballot>(device, utilities, transferDownQueue, inputData, workgroupSize, workgroupCount, buffers[7 ].get (), resultsBuffer. get (), logger)&&passed;
290
283
}
291
284
292
285
return passed;
@@ -305,7 +298,7 @@ class ArythmeticUnitTestApp : public NonGraphicalApplicationBase
305
298
void onAppInitialized_impl () override
306
299
{
307
300
CommonAPI::InitOutput initOutput;
308
- CommonAPI::InitWithNoExt (initOutput, video::EAT_OPENGL , " Subgroup Arithmetic Test" );
301
+ CommonAPI::InitWithNoExt (initOutput, video::EAT_VULKAN , " Subgroup Arithmetic Test" );
309
302
gl = std::move (initOutput.apiConnection );
310
303
gpuPhysicalDevice = std::move (initOutput.physicalDevice );
311
304
logicalDevice = std::move (initOutput.logicalDevice );
@@ -319,6 +312,8 @@ class ArythmeticUnitTestApp : public NonGraphicalApplicationBase
319
312
cpu2gpuParams = std::move (initOutput.cpu2gpuParams );
320
313
utilities = std::move (initOutput.utilities );
321
314
315
+ auto transferDownQueue = queues[CommonAPI::InitOutput::EQT_TRANSFER_DOWN];
316
+
322
317
nbl::video::IGPUObjectFromAssetConverter cpu2gpu;
323
318
324
319
inputData = new uint32_t [BUFFER_DWORD_COUNT];
@@ -330,7 +325,7 @@ class ArythmeticUnitTestApp : public NonGraphicalApplicationBase
330
325
331
326
IGPUBuffer::SCreationParams inputDataBufferCreationParams = {};
332
327
inputDataBufferCreationParams.size = kBufferSize ;
333
- // inputDataBufferCreationParams.usage = ; TODO: Usage should not be EUF_NONE
328
+ inputDataBufferCreationParams.usage = core::bitflag<IGPUBuffer::E_USAGE_FLAGS>(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT;
334
329
auto gpuinputDataBuffer = utilities->createFilledDeviceLocalBufferOnDedMem (queues[decltype (initOutput)::EQT_TRANSFER_UP], std::move (inputDataBufferCreationParams), inputData);
335
330
336
331
// create 8 buffers.
@@ -340,56 +335,19 @@ class ArythmeticUnitTestApp : public NonGraphicalApplicationBase
340
335
for (auto i = 0 ; i < outputBufferCount; i++)
341
336
{
342
337
IGPUBuffer::SCreationParams params;
343
- // (Erfan to Cyprian) note that IGPUBuffer::SCreationParams::size would get ignored before and was passed into the create function as a paramter or in "reqs" but it's very important now and make sure they are set when you're replacing the old usages
344
338
params.size = kBufferSize ;
345
339
params.queueFamilyIndexCount = 0 ;
346
340
params.queueFamilyIndices = nullptr ;
347
- params.sharingMode = ESM_CONCURRENT ;
341
+ params.sharingMode = ESM_EXCLUSIVE ;
348
342
params.usage = core::bitflag (IGPUBuffer::EUF_STORAGE_BUFFER_BIT)|IGPUBuffer::EUF_TRANSFER_SRC_BIT;
349
343
350
- // Notes from Erfan to Cyprian (Delete when Read)
351
- //
352
- // OLD CODE:
353
- // IDeviceMemoryBacked::SDeviceMemoryRequirements reqs;
354
- // reqs.memoryHeapLocation = IDeviceMemoryAllocation::ESMT_DEVICE_LOCAL;
355
- // reqs.mappingCapability = IDeviceMemoryAllocation::EMCAF_READ;
356
- //
357
- // Mindset about memoryTypeBits:
358
- // memoryTypes can each represent a combination of usages, for example HOST_READABLE, DEVICE_LOCAL, ... (see E_MEMORY_TYPE_FLAGS)
359
- // each memoryType (combination of usages) supported by your GPU will be reported in physicalDevice->getMemoryProperties()
360
- // each bit in a MemoryTypeBits represents an index that maps to a memoryType
361
- // So for example if you use physicalDevice->getDeviceLocalMemoryTypeBits() and it returns 0x0000'0003 means there are 2 memory types that have DEVICE_LOCAL flag (index = 0 and 1)
362
- // See IPhysicalDevice::getXXXXXMemoryTypeBits and the comments above the functions
363
- //
364
- // Each createXXXOnDedMem is now replaced by 3 steps ->
365
- // 1. Create buffer/image
366
- // 2. Get It's memory requirements and &= it's memoryTypeBits with the user requested usages
367
- // for example when you see old API usages that use device->getDownStreamingMemoryReqs, you would replace it by reqs &= physicalDevice->getDownStreamingMemoryTypeBits
368
- // 3. use allocate and pass the reqs + image/buffer for dedication
369
- // -> note that previously used memoryCapability is now decided by memoryTypeBits which you can query from PhysDev
370
- // (see commented code above) Previously memoryHeapLocation was ESMT_DEVICE_LOCAL and mapping capability was EMCAD_READ -> so we're looking for a "DEVICE_LOCAL" AND "HOST_READABLE" memoryType
371
- //
372
- // You can access physicalDevice pointer directly in some places or via device->getPhysicalDevice()
373
- //
374
- // Note that previously here the writer of example didn't use helper functions device->getXXXRequiremtns and filled in the reqs themselves because probably none of those would return the requiements of their need
375
- // In that case you could use physDev->getMemoryTypeBitsFromMemoryTypeFlags and pass in the correct Memory Property Flags to fulfill user's "previous" wish
376
-
377
344
buffers[i] = logicalDevice->createBuffer (params);
378
345
auto mreq = buffers[i]->getMemoryReqs ();
379
- mreq.memoryTypeBits &= logicalDevice->getPhysicalDevice ()->getMemoryTypeBitsFromMemoryTypeFlags (video::IDeviceMemoryAllocation::EMPF_DEVICE_LOCAL_BIT);
380
- mreq.memoryTypeBits &= logicalDevice->getPhysicalDevice ()->getMemoryTypeBitsFromMemoryTypeFlags (video::IDeviceMemoryAllocation::EMPF_HOST_READABLE_BIT);
381
-
382
- // (Erfan to Cyprian) assert memoryTypeBits is not 0 because if it is then it means there is no memoryType that is both DeviceLocal and HostReadable
346
+ mreq.memoryTypeBits &= logicalDevice->getPhysicalDevice ()->getDeviceLocalMemoryTypeBits ();
347
+
383
348
assert (mreq.memoryTypeBits );
384
- // (Erfan to Cyprian) We usually don't use the return value (SMemoryOffset) of the allocate function but make sure you set it to some variable named bufferMem or something so we know it's there
385
349
auto bufferMem = logicalDevice->allocate (mreq, buffers[i].get ());
386
350
assert (bufferMem.isValid ());
387
-
388
- IDeviceMemoryAllocation::MappedMemoryRange mem;
389
- mem.memory = buffers[i]->getBoundMemory ();
390
- mem.offset = 0u ;
391
- mem.length = kBufferSize ;
392
- logicalDevice->mapMemory (mem, IDeviceMemoryAllocation::EMCAF_READ);
393
351
}
394
352
395
353
IGPUDescriptorSetLayout::SBinding binding[totalBufferCount];
@@ -399,7 +357,7 @@ class ArythmeticUnitTestApp : public NonGraphicalApplicationBase
399
357
400
358
constexpr uint32_t pushconstantSize = 8u * totalBufferCount;
401
359
SPushConstantRange pcRange[1 ] = { IShader::ESS_COMPUTE,0u ,pushconstantSize };
402
- auto pipelineLayout = logicalDevice->createPipelineLayout (pcRange, pcRange + pushconstantSize , core::smart_refctd_ptr (gpuDSLayout));
360
+ auto pipelineLayout = logicalDevice->createPipelineLayout (pcRange, pcRange + 1u , core::smart_refctd_ptr (gpuDSLayout));
403
361
404
362
auto descPool = logicalDevice->createDescriptorPoolForDSLayouts (IDescriptorPool::ECF_NONE, &gpuDSLayout.get (), &gpuDSLayout.get () + 1u );
405
363
auto descriptorSet = logicalDevice->createDescriptorSet (descPool.get (), core::smart_refctd_ptr (gpuDSLayout));
@@ -475,17 +433,17 @@ class ArythmeticUnitTestApp : public NonGraphicalApplicationBase
475
433
bool passed = true ;
476
434
477
435
const video::IGPUDescriptorSet* ds = descriptorSet.get ();
478
- passed = runTest<emulatedSubgroupReduction>(logicalDevice.get (), computeQueue, fence.get (), cmdbuf.get (), pipelines[0u ].get (), descriptorSet.get (), inputData, workgroupSize, buffers, logger.get ()) && passed;
436
+ passed = runTest<emulatedSubgroupReduction>(logicalDevice.get (), utilities. get (), transferDownQueue, computeQueue, fence.get (), cmdbuf.get (), pipelines[0u ].get (), descriptorSet.get (), inputData, workgroupSize, buffers, logger.get ()) && passed;
479
437
logTestOutcome (passed, workgroupSize);
480
- passed = runTest<emulatedSubgroupScanExclusive>(logicalDevice.get (), computeQueue, fence.get (), cmdbuf.get (), pipelines[1u ].get (), descriptorSet.get (), inputData, workgroupSize, buffers, logger.get ()) && passed;
438
+ passed = runTest<emulatedSubgroupScanExclusive>(logicalDevice.get (), utilities. get (), transferDownQueue, computeQueue, fence.get (), cmdbuf.get (), pipelines[1u ].get (), descriptorSet.get (), inputData, workgroupSize, buffers, logger.get ()) && passed;
481
439
logTestOutcome (passed, workgroupSize);
482
- passed = runTest<emulatedSubgroupScanInclusive>(logicalDevice.get (), computeQueue, fence.get (), cmdbuf.get (), pipelines[2u ].get (), descriptorSet.get (), inputData, workgroupSize, buffers, logger.get ()) && passed;
440
+ passed = runTest<emulatedSubgroupScanInclusive>(logicalDevice.get (), utilities. get (), transferDownQueue, computeQueue, fence.get (), cmdbuf.get (), pipelines[2u ].get (), descriptorSet.get (), inputData, workgroupSize, buffers, logger.get ()) && passed;
483
441
logTestOutcome (passed, workgroupSize);
484
- passed = runTest<emulatedWorkgroupReduction>(logicalDevice.get (), computeQueue, fence.get (), cmdbuf.get (), pipelines[3u ].get (), descriptorSet.get (), inputData, workgroupSize, buffers, logger.get (), true ) && passed;
442
+ passed = runTest<emulatedWorkgroupReduction>(logicalDevice.get (), utilities. get (), transferDownQueue, computeQueue, fence.get (), cmdbuf.get (), pipelines[3u ].get (), descriptorSet.get (), inputData, workgroupSize, buffers, logger.get (), true ) && passed;
485
443
logTestOutcome (passed, workgroupSize);
486
- passed = runTest<emulatedWorkgroupScanExclusive>(logicalDevice.get (), computeQueue, fence.get (), cmdbuf.get (), pipelines[4u ].get (), descriptorSet.get (), inputData, workgroupSize, buffers, logger.get (), true ) && passed;
444
+ passed = runTest<emulatedWorkgroupScanExclusive>(logicalDevice.get (), utilities. get (), transferDownQueue, computeQueue, fence.get (), cmdbuf.get (), pipelines[4u ].get (), descriptorSet.get (), inputData, workgroupSize, buffers, logger.get (), true ) && passed;
487
445
logTestOutcome (passed, workgroupSize);
488
- passed = runTest<emulatedWorkgroupScanInclusive>(logicalDevice.get (), computeQueue, fence.get (), cmdbuf.get (), pipelines[5u ].get (), descriptorSet.get (), inputData, workgroupSize, buffers, logger.get (), true ) && passed;
446
+ passed = runTest<emulatedWorkgroupScanInclusive>(logicalDevice.get (), utilities. get (), transferDownQueue, computeQueue, fence.get (), cmdbuf.get (), pipelines[5u ].get (), descriptorSet.get (), inputData, workgroupSize, buffers, logger.get (), true ) && passed;
489
447
logTestOutcome (passed, workgroupSize);
490
448
}
491
449
computeQueue->endCapture ();
0 commit comments