Skip to content

Commit fddcc6b

Browse files
committed
IUtilities::downloadBufferRangeViaStagingBuffer utility function,
its usage in example 48, buffers changed to device-local, and validation errors fixed
1 parent d716e76 commit fddcc6b

File tree

2 files changed

+215
-85
lines changed

2 files changed

+215
-85
lines changed

examples_tests/48.ArithmeticUnitTest/main.cpp

Lines changed: 29 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -187,22 +187,14 @@ constexpr uint32_t kBufferSize = (1u+BUFFER_DWORD_COUNT)*sizeof(uint32_t);
187187

188188
//returns true if result matches
189189
template<template<class> class Arithmetic, template<class> class OP>
190-
bool validateResults(ILogicalDevice* device, const uint32_t* inputData, const uint32_t workgroupSize, const uint32_t workgroupCount, video::IGPUBuffer* bufferToRead, system::ILogger* logger)
190+
bool validateResults(ILogicalDevice* device, IUtilities* utilities, IGPUQueue* transferDownQueue, const uint32_t* inputData, const uint32_t workgroupSize, const uint32_t workgroupCount, video::IGPUBuffer* bufferToRead, asset::ICPUBuffer* resultsBuffer, system::ILogger* logger)
191191
{
192192
bool success = true;
193193

194-
auto mem = bufferToRead->getBoundMemory();
195-
// (Erfan->Cyprian): see this old code below -> we don't have getMappingCaps function anymore because we can deduce it from memorytype's property flags
196-
// also since they are now `core::bitflag` you can use .hasFlags instead of using &
197-
// if (mem->getMappingCaps()&IDeviceMemoryAllocation::EMCF_COHERENT)
198-
// Also I added a ! because the if check should've been the other way around
199-
if(!mem->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
200-
{
201-
IDeviceMemoryAllocation::MappedMemoryRange rng = {mem,0u,kBufferSize};
202-
device->invalidateMappedMemoryRanges(1u,&rng);
203-
}
194+
SBufferRange<IGPUBuffer> bufferRange = {0u, kBufferSize, core::smart_refctd_ptr<IGPUBuffer>(bufferToRead)};
195+
utilities->downloadBufferRangeViaStagingBuffer(transferDownQueue, bufferRange, resultsBuffer->getPointer());
204196

205-
auto dataFromBuffer = reinterpret_cast<uint32_t*>(mem->getMappedPointer());
197+
auto dataFromBuffer = reinterpret_cast<uint32_t*>(resultsBuffer->getPointer());
206198
const uint32_t subgroupSize = (*dataFromBuffer++);
207199

208200
// TODO: parallel for
@@ -242,7 +234,7 @@ constexpr const auto outputBufferCount = 8u;
242234

243235
template<template<class> class Arithmetic>
244236
bool runTest(
245-
ILogicalDevice* device, IGPUQueue* queue, IGPUFence* reusableFence, IGPUCommandBuffer* cmdbuf, IGPUComputePipeline* pipeline, const IGPUDescriptorSet* ds,
237+
ILogicalDevice* device, IUtilities* utilities, IGPUQueue* transferDownQueue, IGPUQueue* queue, IGPUFence* reusableFence, IGPUCommandBuffer* cmdbuf, IGPUComputePipeline* pipeline, const IGPUDescriptorSet* ds,
246238
const uint32_t* inputData, const uint32_t workgroupSize, core::smart_refctd_ptr<IGPUBuffer>* const buffers, system::ILogger* logger, bool is_workgroup_test = false)
247239
{
248240
// TODO: overlap dispatches with memory readbacks (requires multiple copies of `buffers`)
@@ -275,18 +267,19 @@ bool runTest(
275267
queue->submit(1u,&submit,reusableFence);
276268
device->blockForFences(1u,&reusableFence);
277269
device->resetFences(1u,&reusableFence);
278-
270+
271+
auto resultsBuffer = core::make_smart_refctd_ptr<ICPUBuffer>(kBufferSize);
279272
//check results
280-
bool passed = validateResults<Arithmetic,and_op>(device, inputData, workgroupSize, workgroupCount, buffers[0].get(),logger);
281-
passed = validateResults<Arithmetic,xor_op>(device, inputData, workgroupSize, workgroupCount, buffers[1].get(),logger)&&passed;
282-
passed = validateResults<Arithmetic,or_op>(device, inputData, workgroupSize, workgroupCount, buffers[2].get(),logger)&&passed;
283-
passed = validateResults<Arithmetic,add_op>(device, inputData, workgroupSize, workgroupCount, buffers[3].get(),logger)&&passed;
284-
passed = validateResults<Arithmetic,mul_op>(device, inputData, workgroupSize, workgroupCount, buffers[4].get(),logger)&&passed;
285-
passed = validateResults<Arithmetic,min_op>(device, inputData, workgroupSize, workgroupCount, buffers[5].get(),logger)&&passed;
286-
passed = validateResults<Arithmetic,max_op>(device, inputData, workgroupSize, workgroupCount, buffers[6].get(),logger)&&passed;
273+
bool passed = validateResults<Arithmetic,and_op>(device, utilities, transferDownQueue, inputData, workgroupSize, workgroupCount, buffers[0].get(), resultsBuffer.get(),logger);
274+
passed = validateResults<Arithmetic,xor_op>(device, utilities, transferDownQueue, inputData, workgroupSize, workgroupCount, buffers[1].get(), resultsBuffer.get(),logger)&&passed;
275+
passed = validateResults<Arithmetic,or_op>(device, utilities, transferDownQueue, inputData, workgroupSize, workgroupCount, buffers[2].get(), resultsBuffer.get(),logger)&&passed;
276+
passed = validateResults<Arithmetic,add_op>(device, utilities, transferDownQueue, inputData, workgroupSize, workgroupCount, buffers[3].get(), resultsBuffer.get(),logger)&&passed;
277+
passed = validateResults<Arithmetic,mul_op>(device, utilities, transferDownQueue, inputData, workgroupSize, workgroupCount, buffers[4].get(), resultsBuffer.get(),logger)&&passed;
278+
passed = validateResults<Arithmetic,min_op>(device, utilities, transferDownQueue, inputData, workgroupSize, workgroupCount, buffers[5].get(), resultsBuffer.get(),logger)&&passed;
279+
passed = validateResults<Arithmetic,max_op>(device, utilities, transferDownQueue, inputData, workgroupSize, workgroupCount, buffers[6].get(), resultsBuffer.get(),logger)&&passed;
287280
if(is_workgroup_test)
288281
{
289-
passed = validateResults<Arithmetic,ballot>(device, inputData, workgroupSize, workgroupCount, buffers[7].get(),logger)&&passed;
282+
passed = validateResults<Arithmetic,ballot>(device, utilities, transferDownQueue, inputData, workgroupSize, workgroupCount, buffers[7].get(), resultsBuffer.get(), logger)&&passed;
290283
}
291284

292285
return passed;
@@ -305,7 +298,7 @@ class ArythmeticUnitTestApp : public NonGraphicalApplicationBase
305298
void onAppInitialized_impl() override
306299
{
307300
CommonAPI::InitOutput initOutput;
308-
CommonAPI::InitWithNoExt(initOutput, video::EAT_OPENGL, "Subgroup Arithmetic Test");
301+
CommonAPI::InitWithNoExt(initOutput, video::EAT_VULKAN, "Subgroup Arithmetic Test");
309302
gl = std::move(initOutput.apiConnection);
310303
gpuPhysicalDevice = std::move(initOutput.physicalDevice);
311304
logicalDevice = std::move(initOutput.logicalDevice);
@@ -319,6 +312,8 @@ class ArythmeticUnitTestApp : public NonGraphicalApplicationBase
319312
cpu2gpuParams = std::move(initOutput.cpu2gpuParams);
320313
utilities = std::move(initOutput.utilities);
321314

315+
auto transferDownQueue = queues[CommonAPI::InitOutput::EQT_TRANSFER_DOWN];
316+
322317
nbl::video::IGPUObjectFromAssetConverter cpu2gpu;
323318

324319
inputData = new uint32_t[BUFFER_DWORD_COUNT];
@@ -330,7 +325,7 @@ class ArythmeticUnitTestApp : public NonGraphicalApplicationBase
330325

331326
IGPUBuffer::SCreationParams inputDataBufferCreationParams = {};
332327
inputDataBufferCreationParams.size = kBufferSize;
333-
// inputDataBufferCreationParams.usage = ; TODO: Usage should not be EUF_NONE
328+
inputDataBufferCreationParams.usage = core::bitflag<IGPUBuffer::E_USAGE_FLAGS>(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT;
334329
auto gpuinputDataBuffer = utilities->createFilledDeviceLocalBufferOnDedMem(queues[decltype(initOutput)::EQT_TRANSFER_UP], std::move(inputDataBufferCreationParams), inputData);
335330

336331
//create 8 buffers.
@@ -340,56 +335,19 @@ class ArythmeticUnitTestApp : public NonGraphicalApplicationBase
340335
for (auto i = 0; i < outputBufferCount; i++)
341336
{
342337
IGPUBuffer::SCreationParams params;
343-
// (Erfan to Cyprian) note that IGPUBuffer::SCreationParams::size used to be ignored (the size was passed into the create function as a parameter or in "reqs"), but it's very important now, so make sure it is set when you're replacing the old usages
344338
params.size = kBufferSize;
345339
params.queueFamilyIndexCount = 0;
346340
params.queueFamilyIndices = nullptr;
347-
params.sharingMode = ESM_CONCURRENT;
341+
params.sharingMode = ESM_EXCLUSIVE;
348342
params.usage = core::bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT)|IGPUBuffer::EUF_TRANSFER_SRC_BIT;
349343

350-
// Notes from Erfan to Cyprian (Delete when Read)
351-
//
352-
// OLD CODE:
353-
// IDeviceMemoryBacked::SDeviceMemoryRequirements reqs;
354-
// reqs.memoryHeapLocation = IDeviceMemoryAllocation::ESMT_DEVICE_LOCAL;
355-
// reqs.mappingCapability = IDeviceMemoryAllocation::EMCAF_READ;
356-
//
357-
// Mindset about memoryTypeBits:
358-
// memoryTypes can each represent a combination of usages, for example HOST_READABLE, DEVICE_LOCAL, ... (see E_MEMORY_TYPE_FLAGS)
359-
// each memoryType (combination of usages) supported by your GPU will be reported in physicalDevice->getMemoryProperties()
360-
// each bit in a MemoryTypeBits represents an index that maps to a memoryType
361-
// So for example, if you use physicalDevice->getDeviceLocalMemoryTypeBits() and it returns 0x0000'0003, that means there are 2 memory types that have the DEVICE_LOCAL flag (index = 0 and 1)
362-
// See IPhysicalDevice::getXXXXXMemoryTypeBits and the comments above the functions
363-
//
364-
// Each createXXXOnDedMem is now replaced by 3 steps ->
365-
// 1. Create buffer/image
366-
// 2. Get its memory requirements and &= its memoryTypeBits with the user-requested usages
367-
// for example when you see old API usages that use device->getDownStreamingMemoryReqs, you would replace it by reqs &= physicalDevice->getDownStreamingMemoryTypeBits
368-
// 3. use allocate and pass the reqs + image/buffer for dedication
369-
// -> note that previously used memoryCapability is now decided by memoryTypeBits which you can query from PhysDev
370-
// (see commented code above) Previously memoryHeapLocation was ESMT_DEVICE_LOCAL and mapping capability was EMCAF_READ -> so we're looking for a "DEVICE_LOCAL" AND "HOST_READABLE" memoryType
371-
//
372-
// You can access physicalDevice pointer directly in some places or via device->getPhysicalDevice()
373-
//
374-
// Note that previously the writer of this example didn't use the device->getXXXRequirements helper functions and filled in the reqs themselves, probably because none of those helpers would return the requirements they needed
375-
// In that case you could use physDev->getMemoryTypeBitsFromMemoryTypeFlags and pass in the correct Memory Property Flags to fulfill user's "previous" wish
376-
377344
buffers[i] = logicalDevice->createBuffer(params);
378345
auto mreq = buffers[i]->getMemoryReqs();
379-
mreq.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getMemoryTypeBitsFromMemoryTypeFlags(video::IDeviceMemoryAllocation::EMPF_DEVICE_LOCAL_BIT);
380-
mreq.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getMemoryTypeBitsFromMemoryTypeFlags(video::IDeviceMemoryAllocation::EMPF_HOST_READABLE_BIT);
381-
382-
// (Erfan to Cyprian) assert memoryTypeBits is not 0 because if it is then it means there is no memoryType that is both DeviceLocal and HostReadable
346+
mreq.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
347+
383348
assert(mreq.memoryTypeBits);
384-
// (Erfan to Cyprian) We usually don't use the return value (SMemoryOffset) of the allocate function but make sure you set it to some variable named bufferMem or something so we know it's there
385349
auto bufferMem = logicalDevice->allocate(mreq, buffers[i].get());
386350
assert(bufferMem.isValid());
387-
388-
IDeviceMemoryAllocation::MappedMemoryRange mem;
389-
mem.memory = buffers[i]->getBoundMemory();
390-
mem.offset = 0u;
391-
mem.length = kBufferSize;
392-
logicalDevice->mapMemory(mem, IDeviceMemoryAllocation::EMCAF_READ);
393351
}
394352

395353
IGPUDescriptorSetLayout::SBinding binding[totalBufferCount];
@@ -399,7 +357,7 @@ class ArythmeticUnitTestApp : public NonGraphicalApplicationBase
399357

400358
constexpr uint32_t pushconstantSize = 8u * totalBufferCount;
401359
SPushConstantRange pcRange[1] = { IShader::ESS_COMPUTE,0u,pushconstantSize };
402-
auto pipelineLayout = logicalDevice->createPipelineLayout(pcRange, pcRange + pushconstantSize, core::smart_refctd_ptr(gpuDSLayout));
360+
auto pipelineLayout = logicalDevice->createPipelineLayout(pcRange, pcRange + 1u, core::smart_refctd_ptr(gpuDSLayout));
403361

404362
auto descPool = logicalDevice->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, &gpuDSLayout.get(), &gpuDSLayout.get() + 1u);
405363
auto descriptorSet = logicalDevice->createDescriptorSet(descPool.get(), core::smart_refctd_ptr(gpuDSLayout));
@@ -475,17 +433,17 @@ class ArythmeticUnitTestApp : public NonGraphicalApplicationBase
475433
bool passed = true;
476434

477435
const video::IGPUDescriptorSet* ds = descriptorSet.get();
478-
passed = runTest<emulatedSubgroupReduction>(logicalDevice.get(), computeQueue, fence.get(), cmdbuf.get(), pipelines[0u].get(), descriptorSet.get(), inputData, workgroupSize, buffers, logger.get()) && passed;
436+
passed = runTest<emulatedSubgroupReduction>(logicalDevice.get(), utilities.get(), transferDownQueue, computeQueue, fence.get(), cmdbuf.get(), pipelines[0u].get(), descriptorSet.get(), inputData, workgroupSize, buffers, logger.get()) && passed;
479437
logTestOutcome(passed, workgroupSize);
480-
passed = runTest<emulatedSubgroupScanExclusive>(logicalDevice.get(), computeQueue, fence.get(), cmdbuf.get(), pipelines[1u].get(), descriptorSet.get(), inputData, workgroupSize, buffers, logger.get()) && passed;
438+
passed = runTest<emulatedSubgroupScanExclusive>(logicalDevice.get(), utilities.get(), transferDownQueue, computeQueue, fence.get(), cmdbuf.get(), pipelines[1u].get(), descriptorSet.get(), inputData, workgroupSize, buffers, logger.get()) && passed;
481439
logTestOutcome(passed, workgroupSize);
482-
passed = runTest<emulatedSubgroupScanInclusive>(logicalDevice.get(), computeQueue, fence.get(), cmdbuf.get(), pipelines[2u].get(), descriptorSet.get(), inputData, workgroupSize, buffers, logger.get()) && passed;
440+
passed = runTest<emulatedSubgroupScanInclusive>(logicalDevice.get(), utilities.get(), transferDownQueue, computeQueue, fence.get(), cmdbuf.get(), pipelines[2u].get(), descriptorSet.get(), inputData, workgroupSize, buffers, logger.get()) && passed;
483441
logTestOutcome(passed, workgroupSize);
484-
passed = runTest<emulatedWorkgroupReduction>(logicalDevice.get(), computeQueue, fence.get(), cmdbuf.get(), pipelines[3u].get(), descriptorSet.get(), inputData, workgroupSize, buffers, logger.get(), true) && passed;
442+
passed = runTest<emulatedWorkgroupReduction>(logicalDevice.get(), utilities.get(), transferDownQueue, computeQueue, fence.get(), cmdbuf.get(), pipelines[3u].get(), descriptorSet.get(), inputData, workgroupSize, buffers, logger.get(), true) && passed;
485443
logTestOutcome(passed, workgroupSize);
486-
passed = runTest<emulatedWorkgroupScanExclusive>(logicalDevice.get(), computeQueue, fence.get(), cmdbuf.get(), pipelines[4u].get(), descriptorSet.get(), inputData, workgroupSize, buffers, logger.get(), true) && passed;
444+
passed = runTest<emulatedWorkgroupScanExclusive>(logicalDevice.get(), utilities.get(), transferDownQueue, computeQueue, fence.get(), cmdbuf.get(), pipelines[4u].get(), descriptorSet.get(), inputData, workgroupSize, buffers, logger.get(), true) && passed;
487445
logTestOutcome(passed, workgroupSize);
488-
passed = runTest<emulatedWorkgroupScanInclusive>(logicalDevice.get(), computeQueue, fence.get(), cmdbuf.get(), pipelines[5u].get(), descriptorSet.get(), inputData, workgroupSize, buffers, logger.get(), true) && passed;
446+
passed = runTest<emulatedWorkgroupScanInclusive>(logicalDevice.get(), utilities.get(), transferDownQueue, computeQueue, fence.get(), cmdbuf.get(), pipelines[5u].get(), descriptorSet.get(), inputData, workgroupSize, buffers, logger.get(), true) && passed;
489447
logTestOutcome(passed, workgroupSize);
490448
}
491449
computeQueue->endCapture();

0 commit comments

Comments
 (0)