@@ -21,6 +21,7 @@ class StagingAndMultipleQueuesApp final : public application_templates::BasicMul
21
21
using device_base_t = application_templates::BasicMultiQueueApplication;
22
22
using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
23
23
24
+ // TODO: would be cool if we used `system::ISystem::listItemsInDirectory(sharedInputCWD/"GLI")` as our dataset
24
25
static constexpr std::array imagesToLoad = {
25
26
" ../app_resources/test0.png" ,
26
27
" ../app_resources/test1.png" ,
@@ -54,15 +55,17 @@ class StagingAndMultipleQueuesApp final : public application_templates::BasicMul
54
55
m_histogramSavedSemaphore = m_device->createSemaphore (TIMELINE_SEMAPHORE_STARTING_VALUE);
55
56
56
57
// TODO: create/initialize array of atomic pointers to IGPUImage* and IGPUBuffer* to hold results
57
- // no need i think
58
58
59
+ // TODO: Change the capture start/end to become methods of IAPIConnection, because our current per-queue API is not how RenderDoc works
60
+ getComputeQueue ()->startCapture ();
59
61
std::thread loadImagesThread (&StagingAndMultipleQueuesApp::loadImages, this );
60
62
std::thread saveHistogramsThread (&StagingAndMultipleQueuesApp::saveHistograms, this );
61
63
62
64
calculateHistograms ();
63
65
64
66
loadImagesThread.join ();
65
67
saveHistogramsThread.join ();
68
+ getComputeQueue ()->endCapture ();
66
69
67
70
return true ;
68
71
}
@@ -89,8 +92,8 @@ class StagingAndMultipleQueuesApp final : public application_templates::BasicMul
89
92
90
93
private:
91
94
smart_refctd_ptr<ISemaphore> m_imagesLoadedSemaphore, m_imagesProcessedSemaphore, m_histogramSavedSemaphore;
92
- std::atomic<uint32_t > m_imagesLoadedCnt, m_imagesProcessedCnt, m_imagesDownloadedCnt, m_imagesSavedCnt;
93
95
std::atomic<uint32_t > imageHandlesCreated = 0u ;
96
+ std::atomic<uint32_t > transfersSubmitted = 0u ;
94
97
std::array<core::smart_refctd_ptr<IGPUImage>, IMAGE_CNT> images;
95
98
96
99
static constexpr uint32_t FRAMES_IN_FLIGHT = 3u ;
@@ -104,34 +107,37 @@ class StagingAndMultipleQueuesApp final : public application_templates::BasicMul
104
107
105
108
void loadImages ()
106
109
{
110
+ const core::set<uint32_t > uniqueFamilyIndices = { getTransferUpQueue ()->getFamilyIndex (), getComputeQueue ()->getFamilyIndex () };
111
+ const std::vector<uint32_t > familyIndices (uniqueFamilyIndices.begin (),uniqueFamilyIndices.end ());
112
+ const bool multipleQueueFamilies = familyIndices.size ()>1 ;
113
+
107
114
IAssetLoader::SAssetLoadParams lp;
108
115
lp.logger = m_logger.get ();
109
116
110
117
auto transferUpQueue = getTransferUpQueue ();
111
- const core::bitflag<IGPUCommandPool::CREATE_FLAGS> commandPoolFlags = IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT;
112
118
std::array<core::smart_refctd_ptr<nbl::video::IGPUCommandPool>, FRAMES_IN_FLIGHT> commandPools;
113
119
std::array<core::smart_refctd_ptr<nbl::video::IGPUCommandBuffer>, FRAMES_IN_FLIGHT> commandBuffers;
114
120
std::fill (commandPools.begin (), commandPools.end (), nullptr );
115
121
116
122
core::smart_refctd_ptr<ICPUImage> cpuImages[IMAGE_CNT];
117
123
for (uint32_t i = 0u ; i < FRAMES_IN_FLIGHT; ++i)
118
124
{
125
+ const core::bitflag<IGPUCommandPool::CREATE_FLAGS> commandPoolFlags = IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT;
119
126
commandPools[i] = m_device->createCommandPool (transferUpQueue->getFamilyIndex (), commandPoolFlags);
120
127
commandPools[i]->createCommandBuffers (IGPUCommandPool::BUFFER_LEVEL::PRIMARY, {commandBuffers.data () + i, 1 }, core::smart_refctd_ptr (m_logger));
128
+ commandBuffers[i]->setObjectDebugName ((" Upload Command Buffer #" +std::to_string (i)).c_str ());
121
129
}
122
130
123
131
core::smart_refctd_ptr<ISemaphore> imgFillSemaphore = m_device->createSemaphore (0 );
124
- IQueue::SSubmitInfo::SSemaphoreInfo imgFillSemaphoreInfo[] =
125
- {
126
- {
132
+ imgFillSemaphore->setObjectDebugName (" Image Fill Semaphore" );
133
+ SIntendedSubmitInfo intendedSubmit = {
134
+ .queue = transferUpQueue,
135
+ .waitSemaphores = {},
136
+ .commandBuffers = {}, // fill later
137
+ .scratchSemaphore = {
127
138
.semaphore = imgFillSemaphore.get (),
128
- .value = 1 ,
139
+ .value = 0 ,
129
140
.stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
130
- },
131
- {
132
- .semaphore = m_imagesLoadedSemaphore.get (),
133
- .value = 0xdeadbeef ,
134
- .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS
135
141
}
136
142
};
137
143
@@ -162,13 +168,15 @@ class StagingAndMultipleQueuesApp final : public application_templates::BasicMul
162
168
imgParams.arrayLayers = 1u ;
163
169
imgParams.samples = IImage::E_SAMPLE_COUNT_FLAGS::ESCF_1_BIT;
164
170
imgParams.usage = asset::IImage::EUF_TRANSFER_DST_BIT | asset::IImage::EUF_SAMPLED_BIT;
165
- // constexpr uint32_t FAMILY_INDICES_CNT = 3; // TODO: test on intel integrated GPU (which allows only one queue family)
166
- std::array familyIndices = { getTransferUpQueue ()->getFamilyIndex (), getComputeQueue ()->getFamilyIndex () };
167
- imgParams.queueFamilyIndexCount = familyIndices.size ();
168
- imgParams.queueFamilyIndices = familyIndices.data ();
171
+ if (multipleQueueFamilies)
172
+ {
173
+ imgParams.queueFamilyIndexCount = familyIndices.size ();
174
+ imgParams.queueFamilyIndices = familyIndices.data ();
175
+ }
169
176
imgParams.preinitialized = false ;
170
177
171
178
images[imageIdx] = m_device->createImage (std::move (imgParams));
179
+ images[imageIdx]->setObjectDebugName ((" Image #" +std::to_string (imageIdx)).c_str ());
172
180
auto imageAllocation = m_device->allocate (images[imageIdx]->getMemoryReqs (), images[imageIdx].get (), IDeviceMemoryAllocation::EMAF_NONE);
173
181
imageHandlesCreated++;
174
182
imageHandlesCreated.notify_one ();
@@ -206,13 +214,7 @@ class StagingAndMultipleQueuesApp final : public application_templates::BasicMul
206
214
}
207
215
208
216
IQueue::SSubmitInfo::SCommandBufferInfo imgFillCmdBuffInfo = { cmdBuff.get () };
209
-
210
- imgFillSemaphoreInfo[1 ].value = imageIdx + 1u ;
211
-
212
-
213
- SIntendedSubmitInfo intendedSubmit = {
214
- .frontHalf = {.queue = transferUpQueue, .waitSemaphores = {}, .commandBuffers = {&imgFillCmdBuffInfo, 1 }}, .signalSemaphores = imgFillSemaphoreInfo
215
- };
217
+ intendedSubmit.commandBuffers = {&imgFillCmdBuffInfo,1 };
216
218
217
219
cmdBuff->begin (IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
218
220
@@ -221,18 +223,17 @@ class StagingAndMultipleQueuesApp final : public application_templates::BasicMul
221
223
if (!cmdBuff->pipelineBarrier (E_DEPENDENCY_FLAGS::EDF_NONE, pplnBarrierDepInfo0))
222
224
logFailAndTerminate (" Failed to issue barrier!\n " );
223
225
224
-
225
- transferUpQueue->startCapture ();
226
- uint64_t oldCntr = imgFillSemaphoreInfo[0 ].value ;
226
+ const uint64_t oldCntr = intendedSubmit.scratchSemaphore .value ;
227
227
const bool uploadCommendRecorded = m_utils->updateImageViaStagingBuffer (
228
228
intendedSubmit, cpuImages[imageIdx]->getBuffer (), cpuImages[imageIdx]->getCreationParameters ().format ,
229
229
images[imageIdx].get (), IImage::LAYOUT::TRANSFER_DST_OPTIMAL, cpuImages[imageIdx]->getRegions ()
230
230
);
231
231
if (!uploadCommendRecorded)
232
232
logFailAndTerminate (" Couldn't update image data.\n " );
233
233
234
- if (imgFillSemaphoreInfo[0 ].value != oldCntr)
235
- m_logger->log (" %d overflows when uploading image %d!\n " , ILogger::ELL_PERFORMANCE, imgFillSemaphoreInfo[0 ].value - oldCntr, imageIdx);
234
+ const auto newCntr = intendedSubmit.scratchSemaphore .value ;
235
+ if (newCntr!=oldCntr)
236
+ m_logger->log (" %d overflows when uploading image %d!\n " , ILogger::ELL_PERFORMANCE, newCntr-oldCntr, imageIdx);
236
237
237
238
IGPUCommandBuffer::SPipelineBarrierDependencyInfo pplnBarrierDepInfo1;
238
239
pplnBarrierDepInfo1.imgBarriers = { &imageLayoutTransitionBarrier1, 1 };
@@ -242,10 +243,15 @@ class StagingAndMultipleQueuesApp final : public application_templates::BasicMul
242
243
243
244
cmdBuff->end ();
244
245
245
- intendedSubmit.advanceScratchSemaphoreValue ();
246
- IQueue::SSubmitInfo submitInfo[1 ] = { intendedSubmit };
247
- getTransferUpQueue ()->submit (submitInfo);
248
- transferUpQueue->endCapture ();
246
+ const IQueue::SSubmitInfo::SSemaphoreInfo signalSemaphore = {
247
+ .semaphore =m_imagesLoadedSemaphore.get (),
248
+ .value =imageIdx+1u ,
249
+ // cannot signal from COPY stage because there's a layout transition we need to wait for right after and it doesn't have an explicit stage
250
+ .stageMask =PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS
251
+ };
252
+ getTransferUpQueue ()->submit (intendedSubmit.popSubmit ({&signalSemaphore,1 }));
253
+ transfersSubmitted++;
254
+ transfersSubmitted.notify_one ();
249
255
250
256
251
257
// TODO: this is only for basic testing purposes and will be removed
@@ -259,44 +265,48 @@ class StagingAndMultipleQueuesApp final : public application_templates::BasicMul
259
265
{
260
266
// INITIALIZE COMMON DATA
261
267
auto computeQueue = getComputeQueue ();
262
- const core::bitflag<IGPUCommandPool::CREATE_FLAGS> commandPoolFlags = static_cast <IGPUCommandPool::CREATE_FLAGS>(IGPUCommandPool::CREATE_FLAGS::NONE);
263
- std::array<core::smart_refctd_ptr<nbl::video::IGPUCommandPool>, FRAMES_IN_FLIGHT> commandPools;
264
- std::array<core::smart_refctd_ptr<nbl::video::IGPUCommandBuffer>, FRAMES_IN_FLIGHT> commandBuffers;
268
+
269
+ smart_refctd_ptr<IGPUDescriptorSetLayout> dsLayout;
265
270
core::smart_refctd_ptr<IGPUDescriptorSet> descSets[FRAMES_IN_FLIGHT];
266
- std::fill (commandPools.begin (), commandPools.end (), nullptr );
267
- nbl::video::IGPUDescriptorSetLayout::SBinding bindings[2 ] = {
268
- {
269
- .binding = 0 ,
270
- .type = nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER,
271
- .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
272
- .stageFlags = IGPUShader::E_SHADER_STAGE::ESS_COMPUTE,
273
- .count = 1 ,
274
- .samplers = nullptr
275
- },
271
+ {
272
+ nbl::video::IGPUDescriptorSetLayout::SBinding bindings[2 ] = {
273
+ {
274
+ .binding = 0 ,
275
+ .type = nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER, // TODO: just an image descriptor type when separable samplers arrive
276
+ .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
277
+ .stageFlags = IGPUShader::E_SHADER_STAGE::ESS_COMPUTE,
278
+ .count = 1 ,
279
+ .samplers = nullptr
280
+ },
281
+ {
282
+ .binding = 1 ,
283
+ .type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER,
284
+ .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
285
+ .stageFlags = IGPUShader::E_SHADER_STAGE::ESS_COMPUTE,
286
+ .count = 1 ,
287
+ .samplers = nullptr
288
+ }
289
+ };
290
+
291
+ dsLayout = m_device->createDescriptorSetLayout (bindings);
292
+ if (!dsLayout)
293
+ logFailAndTerminate (" Failed to create a Descriptor Layout!\n " );
294
+ auto descPool = m_device->createDescriptorPoolForDSLayouts (IDescriptorPool::ECF_NONE, { &dsLayout.get (),1 }, &FRAMES_IN_FLIGHT);
295
+ for (uint32_t i = 0u ; i < FRAMES_IN_FLIGHT; ++i)
276
296
{
277
- .binding = 1 ,
278
- .type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER,
279
- .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
280
- .stageFlags = IGPUShader::E_SHADER_STAGE::ESS_COMPUTE,
281
- .count = 1 ,
282
- .samplers = nullptr
297
+ descSets[i] = descPool->createDescriptorSet (core::smart_refctd_ptr (dsLayout));
298
+ descSets[i]->setObjectDebugName ((" Descriptor Set #" + std::to_string (i)).c_str ());
283
299
}
284
- };
285
- smart_refctd_ptr<IGPUDescriptorSetLayout> dsLayout[1 ] = { m_device->createDescriptorSetLayout (bindings) };
286
- if (!dsLayout[0 ])
287
- logFailAndTerminate (" Failed to create a Descriptor Layout!\n " );
288
- smart_refctd_ptr<nbl::video::IDescriptorPool> descPools[FRAMES_IN_FLIGHT] = { // TODO: only one desc pool?
289
- m_device->createDescriptorPoolForDSLayouts (IDescriptorPool::ECF_NONE, {&dsLayout[0 ].get (), 1 }),
290
- m_device->createDescriptorPoolForDSLayouts (IDescriptorPool::ECF_NONE, {&dsLayout[0 ].get (), 1 }),
291
- m_device->createDescriptorPoolForDSLayouts (IDescriptorPool::ECF_NONE, {&dsLayout[0 ].get (), 1 })
292
- };
300
+ }
293
301
302
+ std::array<core::smart_refctd_ptr<nbl::video::IGPUCommandPool>, FRAMES_IN_FLIGHT> commandPools;
303
+ std::array<core::smart_refctd_ptr<nbl::video::IGPUCommandBuffer>, FRAMES_IN_FLIGHT> commandBuffers;
294
304
for (uint32_t i = 0u ; i < FRAMES_IN_FLIGHT; ++i)
295
305
{
306
+ const core::bitflag<IGPUCommandPool::CREATE_FLAGS> commandPoolFlags = IGPUCommandPool::CREATE_FLAGS::NONE;
296
307
commandPools[i] = m_device->createCommandPool (getComputeQueue ()->getFamilyIndex (), commandPoolFlags);
297
308
commandPools[i]->createCommandBuffers (IGPUCommandPool::BUFFER_LEVEL::PRIMARY, {commandBuffers.data () + i, 1 }, core::smart_refctd_ptr (m_logger));
298
-
299
- descSets[i] = descPools[i]->createDescriptorSet (core::smart_refctd_ptr (dsLayout[0 ]));
309
+ commandBuffers[i]->setObjectDebugName ((" Histogram Command Buffer #" + std::to_string (i)).c_str ());
300
310
}
301
311
302
312
// LOAD SHADER FROM FILE
@@ -320,7 +330,7 @@ class StagingAndMultipleQueuesApp final : public application_templates::BasicMul
320
330
pc[0 ].size = sizeof (PushConstants);
321
331
322
332
smart_refctd_ptr<nbl::video::IGPUComputePipeline> pipeline;
323
- smart_refctd_ptr<IGPUPipelineLayout> pplnLayout = m_device->createPipelineLayout (pc, smart_refctd_ptr (dsLayout[ 0 ] ));
333
+ smart_refctd_ptr<IGPUPipelineLayout> pplnLayout = m_device->createPipelineLayout (pc,std::move (dsLayout));
324
334
{
325
335
// Nabla actually has facilities for SPIR-V Reflection and "guessing" pipeline layouts for a given SPIR-V which we'll cover in a different example
326
336
if (!pplnLayout)
@@ -369,6 +379,7 @@ class StagingAndMultipleQueuesApp final : public application_templates::BasicMul
369
379
m_histogramBufferMemPtrs[2 ] = m_histogramBufferMemPtrs[1 ] + HISTOGRAM_SIZE;
370
380
}
371
381
382
+ // TODO: will no longer be necessary after separable samplers and images
372
383
IGPUSampler::SParams samplerParams;
373
384
samplerParams.AnisotropicFilter = false ;
374
385
core::smart_refctd_ptr<IGPUSampler> sampler = m_device->createSampler (samplerParams);
@@ -409,17 +420,18 @@ class StagingAndMultipleQueuesApp final : public application_templates::BasicMul
409
420
params.subresourceRange .aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
410
421
params.subresourceRange .layerCount = images[imageToProcessId]->getCreationParameters ().arrayLayers ;
411
422
412
- imgInfo. desc = m_device->createImageView (std::move (params));
413
- if (!imgInfo. desc )
423
+ auto view = m_device->createImageView (std::move (params));
424
+ if (!view )
414
425
logFailAndTerminate (" Couldn't create descriptor." );
426
+ view->setObjectDebugName ((" Image View #" +std::to_string (imageToProcessId)).c_str ());
427
+ imgInfo.desc = std::move (view);
415
428
imgInfo.info .image = { .sampler = sampler, .imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL };
416
429
417
430
IGPUDescriptorSet::SWriteDescriptorSet write[1 ] = {
418
431
{.dstSet = descSets[resourceIdx].get (), .binding = 0 , .arrayElement = 0 , .count = 1 , .info = &imgInfo }
419
432
};
420
433
m_device->updateDescriptorSets (1 , write, 0u , nullptr );
421
434
422
- computeQueue->startCapture ();
423
435
cmdBuff->begin (IGPUCommandBuffer::USAGE::NONE);
424
436
cmdBuff->beginDebugMarker (" My Compute Dispatch" , core::vectorSIMDf (0 , 1 , 0 , 1 ));
425
437
cmdBuff->bindComputePipeline (pipeline.get ());
@@ -450,8 +462,14 @@ class StagingAndMultipleQueuesApp final : public application_templates::BasicMul
450
462
submitInfo[0 ].commandBuffers = cmdBuffSubmitInfo;
451
463
submitInfo[0 ].signalSemaphores = signalSemaphoreSubmitInfo;
452
464
submitInfo[0 ].waitSemaphores = {waitSemaphoreSubmitInfo, imageToProcessId < FRAMES_IN_FLIGHT ? 1u : 2u };
465
+ // Some devices (e.g. all Intel GPUs) do not expose enough queues for us to allocate different queues to compute and transfers,
466
+ // so our `BasicMultiQueueApplication` will "alias" a single queue to both usages. Normally you don't need to care, but here we're
467
+ // attempting to do "out-of-order" "submit-before-signal" so we need to "hold back" submissions if the queues are aliased!
468
+ // TODO: RenderDoc freezes because it starts capturing immediately upon a submit and can't defer a capture until the semaphores signal.
469
+ if (getTransferUpQueue ()==computeQueue /* || m_api->isRunningInRenderdoc()*/ )
470
+ for (auto old = transfersSubmitted.load (); old <= imageToProcessId; old = transfersSubmitted.load ())
471
+ transfersSubmitted.wait (old);
453
472
computeQueue->submit (submitInfo);
454
- computeQueue->endCapture ();
455
473
std::string msg = std::string (" Image nr " ) + std::to_string (imageToProcessId) + " processed. Resource idx: " + std::to_string (resourceIdx);
456
474
m_logger->log (msg);
457
475
}
0 commit comments