@@ -211,9 +211,10 @@ class NBL_API2 IUtilities : public core::IReferenceCounted
 			auto queue = intendedSubmit.queue;
 			if (!queue)
 			{
-				// TODO: log error
+				m_logger.log("No queue in the `intendedSubmit`!",system::ILogger::ELL_ERROR);
 				return IQueue::RESULT::OTHER_ERROR;
 			}
+
 			// backup in case we need to restore to unmodified state
 			SIntendedSubmitInfo patchedSubmit;
 			memcpy(&patchedSubmit,&intendedSubmit,sizeof(SIntendedSubmitInfo));
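The `memcpy` backup/restore above treats `SIntendedSubmitInfo` as a bag of raw pointers and spans. A guard one could add next to it (a sketch, not in this commit, assuming the struct is meant to stay trivially copyable, which the `memcpy` relies on; needs `<type_traits>`):

    // fails to compile if someone later adds an owning or non-trivial member
    // (e.g. a std::function) and silently breaks the memcpy backup/restore
    static_assert(std::is_trivially_copyable_v<SIntendedSubmitInfo>);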
@@ -227,92 +228,65 @@ class NBL_API2 IUtilities : public core::IReferenceCounted
 			}

 			// patch the commandbuffers if needed
+			core::vector<core::smart_refctd_ptr<IGPUCommandBuffer>> newScratch;
 			core::vector<IQueue::SSubmitInfo::SCommandBufferInfo> patchedCmdBufs;
-			auto patchCmdBuf = [&]()->void{patchedCmdBufs.resize(patchedSubmit.commandBuffers.size()+1);};
-			if (auto* candidateScratch=patchedSubmit.getScratchCommandBuffer(); candidateScratch)
-			switch (candidateScratch->getState())
-			{
-				case IGPUCommandBuffer::STATE::INITIAL:
-				case IGPUCommandBuffer::STATE::INVALID:
-					if (candidateScratch->isResettable() && candidateScratch->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT))
-						break;
-					patchCmdBuf();
-					break;
-				case IGPUCommandBuffer::STATE::RECORDING:
-					if (candidateScratch->isResettable() && candidateScratch->getRecordingFlags().hasFlags(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT))
-						break;
-					candidateScratch->end();
-					patchCmdBuf();
-					break;
-				default:
-					patchCmdBuf();
-					break;
-			}
-			else
-				patchCmdBuf();
-
-			core::smart_refctd_ptr<IGPUCommandBuffer> newScratch;
-			if (!patchedCmdBufs.empty())
+			if (patchedSubmit.scratchCommandBuffers.empty())
 			{
-				// allocate a span one larger than the original
-				const auto origCmdBufs = patchedSubmit.commandBuffers;
-				patchedSubmit.commandBuffers = patchedCmdBufs;
-				// copy the original commandbuffers
-				std::copy(origCmdBufs.begin(),origCmdBufs.end(),patchedCmdBufs.begin());
-				// create the scratch commandbuffer (the patching)
+				constexpr size_t defaultSubmitsInFlight = 8;
+				newScratch.resize(defaultSubmitsInFlight);
+				// create the scratch commandbuffers (the patching)
 				{
 					auto device = const_cast<ILogicalDevice*>(queue->getOriginDevice());
 					auto pool = device->createCommandPool(queue->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
-					if (!pool || !pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&newScratch,1}))
+					if (!pool || !pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,newScratch))
 					{
-						// TODO: log error
-						return IQueue::RESULT::OTHER_ERROR;
-					}
-					if (!newScratch->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT))
-					{
-						// TODO: log error
+						m_logger.log("Either couldn't create a command pool or the command buffers!",system::ILogger::ELL_ERROR);
 						return IQueue::RESULT::OTHER_ERROR;
 					}
 				}
-				patchedCmdBufs[origCmdBufs.size()] = {newScratch.get()};
-				patchedSubmit.commandBuffers = patchedCmdBufs;
+				// begin
+				if (auto cmdbuf=newScratch.front().get(); !cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT))
+				{
+					m_logger.log("Could not begin command buffer %p",system::ILogger::ELL_ERROR,cmdbuf);
+					return IQueue::RESULT::OTHER_ERROR;
+				}
+				// then fill the info vector
+				patchedCmdBufs.reserve(newScratch.size());
+				for (const auto& cmdbuf : newScratch)
+					patchedCmdBufs.emplace_back(cmdbuf.get());
+				patchedSubmit.scratchCommandBuffers = patchedCmdBufs;
 			}

 			if (!patchedSubmit.valid())
 			{
-				// TODO: log error
+				m_logger.log("Even patching failed to create a valid `SIntendedSubmitInfo`!",system::ILogger::ELL_ERROR);
 				return IQueue::RESULT::OTHER_ERROR;
 			}

 			if (!what(patchedSubmit))
 			{
-				// TODO: log error
+				m_logger.log("Function to `autoSubmit` failed recording/overflowing!",system::ILogger::ELL_ERROR);
 				return IQueue::RESULT::OTHER_ERROR;
 			}
 			// no way back now, have to modify the intended submit
 			memcpy(&intendedSubmit,&patchedSubmit,sizeof(intendedSubmit));
-			intendedSubmit.getScratchCommandBuffer()->end();
-
-			const auto submit = intendedSubmit.popSubmit(extraSignalSemaphores);
-			if (newScratch)
-				intendedSubmit.commandBuffers = {};
+			auto finalScratch = intendedSubmit.valid()->cmdbuf;
+			finalScratch->end();
+			const auto submit = intendedSubmit.popSubmit(finalScratch,extraSignalSemaphores);
+			// have to let go of our temporaries
+			if (!patchedCmdBufs.empty())
+				intendedSubmit.scratchCommandBuffers = {};
 			if (const auto error=queue->submit(submit); error!=IQueue::RESULT::SUCCESS)
 			{
 				if (patchedSemaphore)
-				{
-					intendedSubmit.waitSemaphores = {};
 					intendedSubmit.scratchSemaphore = {};
-				}
 				return error;
 			}

 			ISemaphore::future_t<IQueue::RESULT> retval(IQueue::RESULT::SUCCESS);
 			retval.set({intendedSubmit.scratchSemaphore.semaphore,intendedSubmit.scratchSemaphore.value});
 			if (patchedSemaphore)
-			{
-				intendedSubmit.waitSemaphores = {};
 				intendedSubmit.scratchSemaphore = {};
-			}
 			return retval;
 		}

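For orientation, a hedged caller-side sketch of the reworked flow: leave `scratchCommandBuffers` empty and let the patching above create the in-flight ring of 8 one-time-submit commandbuffers. The `autoSubmit` entry point, the `utilities`/`transferQueue`/`scratchSema` names and the exact `scratchSemaphore` field layout are assumptions from context, not guaranteed by this diff:

    // hypothetical usage, names and field layout assumed from the code above
    SIntendedSubmitInfo intended = {};
    intended.queue = transferQueue; // must be non-null, see the first check
    intended.scratchSemaphore = {scratchSema.get(),0u}; // timeline semaphore the utility bumps
    // `scratchCommandBuffers` stays empty on purpose, triggering the patching path
    auto future = utilities->autoSubmit(intended,[&](SIntendedSubmitInfo& submit)->bool
    {
        // record transfers against `submit`, overflow-submitting when scratch runs out
        return true;
    });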
@@ -343,14 +317,14 @@ class NBL_API2 IUtilities : public core::IReferenceCounted
 				return false;
 			}

-			if (!commonTransferValidation(nextSubmit))
+			auto* scratch = commonTransferValidation(nextSubmit);
+			if (!scratch)
 				return false;

 			const auto& limits = m_device->getPhysicalDevice()->getLimits();
 			// TODO: Why did we settle on `/4`? It definitely wasn't about the uint32_t size!
 			const uint32_t optimalTransferAtom = core::min<uint32_t>(limits.maxResidentInvocations*OptimalCoalescedInvocationXferSize,m_defaultUploadBuffer->get_total_size()/4);

-			auto cmdbuf = nextSubmit.getScratchCommandBuffer();
 			// no pipeline barriers necessary because write and optional flush happens before submit, and memory allocation is reclaimed after fence signal
 			for (size_t uploadedSize=0ull; uploadedSize<bufferRange.size;)
 			{
@@ -359,7 +333,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted
 				// how large we can make the allocation
 				uint32_t maxFreeBlock = m_defaultUploadBuffer.get()->max_size();
 				// get allocation size
-				const uint32_t allocationSize = getAllocationSizeForStreamingBuffer(size, m_allocationAlignment, maxFreeBlock, optimalTransferAtom);
+				const uint32_t allocationSize = getAllocationSizeForStreamingBuffer(size,m_allocationAlignment,maxFreeBlock,optimalTransferAtom);
 				// make sure we don't overrun the destination buffer due to padding
 				const uint32_t subSize = core::min(allocationSize,size);
 				// cannot use `multi_place` because of the extra padding size we could have added
@@ -373,7 +347,13 @@ class NBL_API2 IUtilities : public core::IReferenceCounted
 				}
 				else
 				{
-					nextSubmit.overflowSubmit();
+					const auto completed = nextSubmit.getFutureScratchSemaphore();
+					nextSubmit.overflowSubmit(scratch);
+					// overflowSubmit no longer blocks for the last submit to have completed, so we must do it ourselves here
+					// TODO: if we cleverly overflowed BEFORE completely running out of memory (better heuristics) then we wouldn't need to do this and some CPU-GPU overlap could be achieved
+					if (nextSubmit.overflowCallback)
+						nextSubmit.overflowCallback(completed);
+					m_device->blockForSemaphores({&completed,1});
 					continue; // keep trying again
 				}
 				// some platforms expose non-coherent host-visible GPU memory, so writes need to be flushed explicitly
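The new `overflowCallback` hook gives the caller a chance to react to (or hide) the stall that `blockForSemaphores` introduces. A minimal sketch, assuming the callback takes the same wait-info that gets blocked on (`logger` is illustrative):

    // hypothetical: runs right before the CPU blocks on the overflow submit
    nextSubmit.overflowCallback = [&](const ISemaphore::SWaitInfo& completed) -> void
    {
        logger->log("Overflowing, blocking CPU until semaphore value %llu",system::ILogger::ELL_PERFORMANCE,completed.value);
    };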
@@ -387,9 +367,9 @@ class NBL_API2 IUtilities : public core::IReferenceCounted
 				copy.srcOffset = localOffset;
 				copy.dstOffset = bufferRange.offset+uploadedSize;
 				copy.size = subSize;
-				cmdbuf->copyBuffer(m_defaultUploadBuffer.get()->getBuffer(), bufferRange.buffer.get(), 1u, &copy);
+				scratch->cmdbuf->copyBuffer(m_defaultUploadBuffer.get()->getBuffer(), bufferRange.buffer.get(), 1u, &copy);
 				// this doesn't actually free the memory, the memory is queued up to be freed only after the `scratchSemaphore` reaches a value a future submit will signal
-				m_defaultUploadBuffer.get()->multi_deallocate(1u,&localOffset,&allocationSize,nextSubmit.getFutureScratchSemaphore(),&cmdbuf);
+				m_defaultUploadBuffer.get()->multi_deallocate(1u,&localOffset,&allocationSize,nextSubmit.getFutureScratchSemaphore(),&scratch->cmdbuf);
 				uploadedSize += subSize;
 			}
 			return true;
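To make the `multi_deallocate` comment concrete: nothing is freed at record time; the suballocation is only reclaimed once the scratch semaphore reaches the value a future submit signals, and the trailing `&scratch->cmdbuf` keeps the commandbuffer alive just as long. Conceptually (a simplified model of the idea, not the actual allocator code):

    // simplified idea of semaphore-deferred freeing
    struct DeferredFree
    {
        ISemaphore::SWaitInfo wait; // from getFutureScratchSemaphore()
        uint32_t offset; // suballocation to return to the free-list
        uint32_t size;
    };
    // later allocations poll outstanding entries, reclaiming the signalled ones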
@@ -520,14 +500,14 @@ class NBL_API2 IUtilities : public core::IReferenceCounted
 				return false;
 			}

-			if (!commonTransferValidation(nextSubmit))
+			auto* scratch = commonTransferValidation(nextSubmit);
+			if (!scratch)
 				return false;

 			const auto& limits = m_device->getPhysicalDevice()->getLimits();
 			// TODO: Why did we settle on `/4`? It definitely wasn't about the uint32_t size!
 			const uint32_t optimalTransferAtom = core::min<uint32_t>(limits.maxResidentInvocations*OptimalCoalescedInvocationXferSize,m_defaultDownloadBuffer->get_total_size()/4);

-			auto cmdbuf = nextSubmit.getScratchCommandBuffer();
 			// Basically downloadedSize is downloadRecordedIntoCommandBufferSize :D
 			for (size_t downloadedSize=0ull; downloadedSize<srcBufferRange.size;)
 			{
@@ -547,12 +527,12 @@ class NBL_API2 IUtilities : public core::IReferenceCounted
 					copy.srcOffset = srcBufferRange.offset + downloadedSize;
 					copy.dstOffset = localOffset;
 					copy.size = copySize;
-					cmdbuf->copyBuffer(srcBufferRange.buffer.get(),m_defaultDownloadBuffer->getBuffer(),1u,&copy);
+					scratch->cmdbuf->copyBuffer(srcBufferRange.buffer.get(),m_defaultDownloadBuffer->getBuffer(),1u,&copy);

 					auto dataConsumer = core::make_smart_refctd_ptr<CDownstreamingDataConsumer>(
 						IDeviceMemoryAllocation::MemoryRange(localOffset,copySize),
 						consumeCallback,
-						core::smart_refctd_ptr<IGPUCommandBuffer>(cmdbuf),
+						core::smart_refctd_ptr<IGPUCommandBuffer>(scratch->cmdbuf),
 						m_defaultDownloadBuffer.get(),
 						downloadedSize
 					);
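For completeness, a hedged sketch of a `consumeCallback` one might pass in; the `(dstOffset, srcPtr, size)` signature and the `cpuDst` destination are assumptions from how `CDownstreamingDataConsumer` is fed above, not something this diff shows:

    // hypothetical consumer: copy each finished chunk out of the mapped download buffer
    auto consumeCallback = [cpuDst](const size_t dstOffset, const void* srcPtr, const size_t size) -> void
    {
        memcpy(reinterpret_cast<char*>(cpuDst)+dstOffset,srcPtr,size);
    };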
@@ -561,7 +541,15 @@ class NBL_API2 IUtilities : public core::IReferenceCounted
 					downloadedSize += copySize;
 				}
 				else // but first submit the already buffered up copies
-					nextSubmit.overflowSubmit();
+				{
+					const auto completed = nextSubmit.getFutureScratchSemaphore();
+					nextSubmit.overflowSubmit(scratch);
+					// overflowSubmit no longer blocks for the last submit to have completed, so we must do it ourselves here
+					// TODO: if we cleverly overflowed BEFORE completely running out of memory (better heuristics) then we wouldn't need to do this and some CPU-GPU overlap could be achieved
+					if (nextSubmit.overflowCallback)
+						nextSubmit.overflowCallback(completed);
+					m_device->blockForSemaphores({&completed,1});
+				}
 			}
 			return true;
 		}
@@ -684,23 +672,24 @@ class NBL_API2 IUtilities : public core::IReferenceCounted

 	protected:
 		//
-		inline bool commonTransferValidation(const SIntendedSubmitInfo& intendedNextSubmit)
+		inline const IQueue::SSubmitInfo::SCommandBufferInfo* commonTransferValidation(const SIntendedSubmitInfo& intendedNextSubmit)
 		{
-			if (!intendedNextSubmit.valid())
+			auto retval = intendedNextSubmit.valid();
+			if (!retval)
 			{
 				m_logger.log("Invalid `intendedNextSubmit`.", nbl::system::ILogger::ELL_ERROR);
-				return false;
+				return nullptr;
 			}

 			assert(intendedNextSubmit.queue);
 			auto queueFamProps = m_device->getPhysicalDevice()->getQueueFamilyProperties()[intendedNextSubmit.queue->getFamilyIndex()];
 			if (!queueFamProps.queueFlags.hasFlags(IQueue::FAMILY_FLAGS::TRANSFER_BIT))
 			{
 				m_logger.log("Invalid `intendedNextSubmit.queue` is not capable of transfer operations!", nbl::system::ILogger::ELL_ERROR);
-				return false;
+				return nullptr;
 			}

-			return true;
+			return retval;
 		}

 		// The application must round down the start of the range to the nearest multiple of VkPhysicalDeviceLimits::nonCoherentAtomSize,
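A side effect of the new return type worth calling out: validation and scratch lookup collapse into a single call, with `nullptr` as the failure sentinel. Both transfer paths above now follow the same call-site pattern:

    // the pattern this enables (recordCopies is a stand-in for real recording)
    if (const auto* scratch = commonTransferValidation(nextSubmit))
        recordCopies(scratch->cmdbuf);
    else
        return false;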