Skip to content

Commit 487238d

Browse files
devshkeptsecret
authored andcommitted
fix a possible sync bug due to wrong scratch semaphore signal stage
1 parent d1106f9 commit 487238d

File tree

3 files changed

+20
-0
lines changed

3 files changed

+20
-0
lines changed

include/nbl/video/IDeviceMemoryBacked.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ class IDeviceMemoryBacked : public IBackendObject
5050
{
5151
if (!isConcurrentSharing())
5252
return true;
53+
assert(queueFamilyIndices);
5354
for (uint8_t f=0; f<queueFamilyIndexCount; f++)
5455
if (queueFamilyIndices[f]==family)
5556
return true;

include/nbl/video/utilities/IUtilities.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -359,6 +359,8 @@ class NBL_API2 IUtilities : public core::IReferenceCounted
359359
if (manualFlush)
360360
flushRanges.reserve((bufferRange.size-1)/m_defaultUploadBuffer.get()->max_size()+1);
361361

362+
const auto oldScratchStage = nextSubmit.scratchSemaphore.stageMask;
363+
//
362364
auto* uploadBuffer = m_defaultUploadBuffer.get()->getBuffer();
363365
// no pipeline barriers necessary because write and optional flush happens before submit, and memory allocation is reclaimed after fence signal
364366
for (size_t uploadedSize=0ull; uploadedSize<bufferRange.size;)
@@ -398,7 +400,11 @@ class NBL_API2 IUtilities : public core::IReferenceCounted
398400
flushRanges.clear();
399401
}
400402
const auto completed = nextSubmit.getFutureScratchSemaphore();
403+
// for the signal to be useful for us to let go of memory, we need to signal after transfer is finished
404+
nextSubmit.scratchSemaphore.stageMask |= asset::PIPELINE_STAGE_FLAGS::COPY_BIT;
401405
nextSubmit.overflowSubmit(scratch);
406+
// first submit we respect whatever stages the user had (maybe they wanted to be notified of the completion of `nextSubmit.prevCommandBuffers`
407+
nextSubmit.scratchSemaphore.stageMask = {};
402408
// overflowSubmit no longer blocks for the last submit to have completed, so we must do it ourselves here
403409
// TODO: if we cleverly overflowed BEFORE completely running out of memory (better heuristics) then we wouldn't need to do this and some CPU-GPU overlap could be achieved
404410
if (nextSubmit.overflowCallback)
@@ -419,6 +425,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted
419425
m_defaultUploadBuffer.get()->multi_deallocate(1u,&localOffset,&allocationSize,nextSubmit.getFutureScratchSemaphore(),&scratch->cmdbuf);
420426
uploadedSize += subSize;
421427
}
428+
nextSubmit.scratchSemaphore.stageMask = oldScratchStage;
422429
if (!flushRanges.empty())
423430
m_device->flushMappedMemoryRanges(flushRanges);
424431
return true;
@@ -600,6 +607,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted
600607
// TODO: Why did we settle on `/4` ? It definitely wasn't about the uint32_t size!
601608
const uint32_t optimalTransferAtom = core::min<uint32_t>(limits.maxResidentInvocations*OptimalCoalescedInvocationXferSize,m_defaultDownloadBuffer->get_total_size()/4);
602609

610+
const auto oldScratchStage = nextSubmit.scratchSemaphore.stageMask;
603611
// Basically downloadedSize is downloadRecordedIntoCommandBufferSize :D
604612
for (size_t downloadedSize=0ull; downloadedSize<srcBufferRange.size;)
605613
{
@@ -635,14 +643,19 @@ class NBL_API2 IUtilities : public core::IReferenceCounted
635643
else // but first sumbit the already buffered up copies
636644
{
637645
const auto completed = nextSubmit.getFutureScratchSemaphore();
646+
// for the signal to be useful for us to execute the data consumer callback, the signal must happen after the copy is done
647+
nextSubmit.scratchSemaphore.stageMask |= asset::PIPELINE_STAGE_FLAGS::COPY_BIT;
638648
nextSubmit.overflowSubmit(scratch);
649+
// first submit we respect whatever stages the user had (maybe they wanted to be notified of the completion of `nextSubmit.prevCommandBuffers`
650+
nextSubmit.scratchSemaphore.stageMask = {};
639651
// overflowSubmit no longer blocks for the last submit to have completed, so we must do it ourselves here
640652
// TODO: if we cleverly overflowed BEFORE completely running out of memory (better heuristics) then we wouldn't need to do this and some CPU-GPU overlap could be achieved
641653
if (nextSubmit.overflowCallback)
642654
nextSubmit.overflowCallback(completed);
643655
m_device->blockForSemaphores({&completed,1});
644656
}
645657
}
658+
nextSubmit.scratchSemaphore.stageMask = oldScratchStage;
646659
return true;
647660
}
648661

src/nbl/video/utilities/IUtilities.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ bool IUtilities::updateImageViaStagingBuffer(
7979
flushRanges.reserve(maxIterations);
8080

8181
auto* uploadBuffer = m_defaultUploadBuffer.get()->getBuffer();
82+
const auto oldScratchStage = intendedNextSubmit.scratchSemaphore.stageMask;
8283
while (!regionIterator.isFinished())
8384
{
8485
size_t memoryNeededForRemainingRegions = regionIterator.getMemoryNeededForRemainingRegions();
@@ -107,7 +108,11 @@ bool IUtilities::updateImageViaStagingBuffer(
107108
flushRanges.clear();
108109
}
109110
const auto completed = intendedNextSubmit.getFutureScratchSemaphore();
111+
// for the signal to be useful for us to let go of memory, we need to signal after transfer is finished
112+
intendedNextSubmit.scratchSemaphore.stageMask |= asset::PIPELINE_STAGE_FLAGS::COPY_BIT;
110113
intendedNextSubmit.overflowSubmit(scratch);
114+
// first submit we respect whatever stages the user had (maybe they wanted to be notified of the completion of `nextSubmit.prevCommandBuffers`
115+
intendedNextSubmit.scratchSemaphore.stageMask = {};
111116
// overflowSubmit no longer blocks for the last submit to have completed, so we must do it ourselves here
112117
// TODO: if we cleverly overflowed BEFORE completely running out of memory (better heuristics) then we wouldn't need to do this and some CPU-GPU overlap could be achieved
113118
if (intendedNextSubmit.overflowCallback)
@@ -149,6 +154,7 @@ bool IUtilities::updateImageViaStagingBuffer(
149154
// this doesn't actually free the memory, the memory is queued up to be freed only after the GPU fence/event is signalled
150155
m_defaultUploadBuffer.get()->multi_deallocate(1u,&localOffset,&allocationSize,intendedNextSubmit.getFutureScratchSemaphore()); // can queue with a reset but not yet pending fence, just fine
151156
}
157+
intendedNextSubmit.scratchSemaphore.stageMask = oldScratchStage;
152158
if (!flushRanges.empty())
153159
m_device->flushMappedMemoryRanges(flushRanges);
154160
return true;

0 commit comments

Comments
 (0)