Skip to content

Commit a416a0c

Browse files
author
devsh
committed
buffer up the memory flushes whenever possible, also factor out the mapped memory align function out of IUtilities because it's useful
1 parent ef011fe commit a416a0c

File tree

5 files changed

+52
-24
lines changed

5 files changed

+52
-24
lines changed

include/nbl/video/IDeviceMemoryAllocation.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ class IDeviceMemoryAllocation : public virtual core::IReferenceCounted
9191
//! Utility function, tell us if writes by the CPU or GPU need extra visibility operations to become visible for reading on the other processor
9292
/** Only execute flushes or invalidations if the allocation requires them, and batch them (flush one combined range instead of two or more)
9393
for greater efficiency. To execute a flush or invalidation, use ILogicalDevice::flushMappedMemoryRanges and ILogicalDevice::invalidateMappedMemoryRanges respectively. */
94+
// TODO: "Visible" is a misnomer, it collides with Vulkan memory model nomenclature where visibility only concerns reads, whereas this covers both reads and writes (visibility and availability)
9495
inline bool haveToMakeVisible() const
9596
{
9697
return !m_memoryPropertyFlags.hasFlags(EMPF_HOST_COHERENT_BIT);
@@ -102,6 +103,9 @@ class IDeviceMemoryAllocation : public virtual core::IReferenceCounted
102103
size_t offset = 0ull;
103104
size_t length = 0ull;
104105
};
106+
// makes sure offset and length are aligned to the `SPhysicalDeviceLimits::nonCoherentAtomSize` but also not outside the memory allocation
107+
MemoryRange alignNonCoherentRange(MemoryRange range) const;
108+
//
105109
inline void* map(const MemoryRange& range, const core::bitflag<E_MAPPING_CPU_ACCESS_FLAGS> accessHint=IDeviceMemoryAllocation::EMCAF_READ_AND_WRITE)
106110
{
107111
if (isCurrentlyMapped())

include/nbl/video/ILogicalDevice.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -202,8 +202,12 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe
202202
//! Similar to VkMappedMemoryRange but no pNext
203203
struct MappedMemoryRange
204204
{
205+
struct align_non_coherent_tag_t {};
206+
constexpr static inline align_non_coherent_tag_t align_non_coherent_tag = {};
207+
205208
MappedMemoryRange() : memory(nullptr), range{} {}
206-
MappedMemoryRange(IDeviceMemoryAllocation* mem, const size_t& off, const size_t& len) : memory(mem), range{off,len} {}
209+
MappedMemoryRange(IDeviceMemoryAllocation* mem, const size_t off, const size_t len) : memory(mem), range{off,len} {}
210+
MappedMemoryRange(IDeviceMemoryAllocation* mem, const size_t off, const size_t len, const align_non_coherent_tag_t) : memory(mem), range(mem->alignNonCoherentRange({off,len})) {}
207211

208212
inline bool valid() const
209213
{

include/nbl/video/utilities/IUtilities.h

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -354,6 +354,12 @@ class NBL_API2 IUtilities : public core::IReferenceCounted
354354
const uint32_t optimalTransferAtom = core::min<uint32_t>(limits.maxResidentInvocations*OptimalCoalescedInvocationXferSize,m_defaultUploadBuffer->get_total_size()/4);
355355
const auto minBlockSize = m_defaultUploadBuffer->getAddressAllocator().min_size();
356356

357+
core::vector<ILogicalDevice::MappedMemoryRange> flushRanges;
358+
const bool manualFlush = m_defaultUploadBuffer.get()->needsManualFlushOrInvalidate();
359+
if (manualFlush)
360+
flushRanges.reserve((bufferRange.size-1)/m_defaultUploadBuffer.get()->max_size()+1);
361+
362+
auto* uploadBuffer = m_defaultUploadBuffer.get()->getBuffer();
357363
// no pipeline barriers necessary because write and optional flush happens before submit, and memory allocation is reclaimed after fence signal
358364
for (size_t uploadedSize=0ull; uploadedSize<bufferRange.size;)
359365
{
@@ -386,6 +392,11 @@ class NBL_API2 IUtilities : public core::IReferenceCounted
386392
}
387393
else
388394
{
395+
if (!flushRanges.empty())
396+
{
397+
m_device->flushMappedMemoryRanges(flushRanges);
398+
flushRanges.clear();
399+
}
389400
const auto completed = nextSubmit.getFutureScratchSemaphore();
390401
nextSubmit.overflowSubmit(scratch);
391402
// overflowSubmit no longer blocks for the last submit to have completed, so we must do it ourselves here
@@ -396,21 +407,20 @@ class NBL_API2 IUtilities : public core::IReferenceCounted
396407
continue; // keep trying again
397408
}
398409
// some platforms expose non-coherent host-visible GPU memory, so writes need to be flushed explicitly
399-
if (m_defaultUploadBuffer.get()->needsManualFlushOrInvalidate())
400-
{
401-
auto flushRange = AlignedMappedMemoryRange(m_defaultUploadBuffer.get()->getBuffer()->getBoundMemory().memory,localOffset,subSize,limits.nonCoherentAtomSize);
402-
m_device->flushMappedMemoryRanges(1u,&flushRange);
403-
}
410+
if (manualFlush)
411+
flushRanges.emplace_back(uploadBuffer->getBoundMemory().memory,localOffset,subSize,ILogicalDevice::MappedMemoryRange::align_non_coherent_tag);
404412
// after we make sure writes are in GPU memory (visible to GPU) and not still in a cache, we can copy using the GPU to device-only memory
405413
IGPUCommandBuffer::SBufferCopy copy;
406414
copy.srcOffset = localOffset;
407415
copy.dstOffset = bufferRange.offset+uploadedSize;
408416
copy.size = subSize;
409-
scratch->cmdbuf->copyBuffer(m_defaultUploadBuffer.get()->getBuffer(), bufferRange.buffer.get(), 1u, &copy);
417+
scratch->cmdbuf->copyBuffer(uploadBuffer, bufferRange.buffer.get(), 1u, &copy);
410418
// this doesn't actually free the memory, the memory is queued up to be freed only after the `scratchSemaphore` reaches a value a future submit will signal
411419
m_defaultUploadBuffer.get()->multi_deallocate(1u,&localOffset,&allocationSize,nextSubmit.getFutureScratchSemaphore(),&scratch->cmdbuf);
412420
uploadedSize += subSize;
413421
}
422+
if (!flushRanges.empty())
423+
m_device->flushMappedMemoryRanges(flushRanges);
414424
return true;
415425
}
416426
// overload to make invokers not care about l-value or r-value
@@ -530,11 +540,12 @@ class NBL_API2 IUtilities : public core::IReferenceCounted
530540
~CDownstreamingDataConsumer()
531541
{
532542
assert(m_downstreamingBuffer);
533-
auto device = const_cast<ILogicalDevice*>(m_downstreamingBuffer->getBuffer()->getOriginDevice());
543+
auto* downstreamingBuffer = m_downstreamingBuffer->getBuffer();
544+
auto device = const_cast<ILogicalDevice*>(downstreamingBuffer->getOriginDevice());
534545
if (m_downstreamingBuffer->needsManualFlushOrInvalidate())
535546
{
536547
const auto nonCoherentAtomSize = device->getPhysicalDevice()->getLimits().nonCoherentAtomSize;
537-
auto flushRange = AlignedMappedMemoryRange(m_downstreamingBuffer->getBuffer()->getBoundMemory().memory,m_copyRange.offset,m_copyRange.length,nonCoherentAtomSize);
548+
auto flushRange = ILogicalDevice::MappedMemoryRange(downstreamingBuffer->getBoundMemory().memory,m_copyRange.offset,m_copyRange.length,ILogicalDevice::MappedMemoryRange::align_non_coherent_tag);
538549
device->invalidateMappedMemoryRanges(1u,&flushRange);
539550
}
540551
// Call the function
@@ -744,17 +755,6 @@ class NBL_API2 IUtilities : public core::IReferenceCounted
744755
return retval;
745756
}
746757

747-
// The application must round down the start of the range to the nearest multiple of VkPhysicalDeviceLimits::nonCoherentAtomSize,
748-
// and round the end of the range up to the nearest multiple of VkPhysicalDeviceLimits::nonCoherentAtomSize.
749-
static ILogicalDevice::MappedMemoryRange AlignedMappedMemoryRange(IDeviceMemoryAllocation* mem, const size_t& off, const size_t& len, size_t nonCoherentAtomSize)
750-
{
751-
ILogicalDevice::MappedMemoryRange range = {};
752-
range.memory = mem;
753-
range.offset = core::alignDown(off, nonCoherentAtomSize);
754-
range.length = core::min(core::alignUp(len, nonCoherentAtomSize), mem->getAllocationSize());
755-
return range;
756-
}
757-
758758

759759
core::smart_refctd_ptr<ILogicalDevice> m_device;
760760

src/nbl/video/IDeviceMemoryAllocation.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,12 @@ E_API_TYPE IDeviceMemoryAllocation::getAPIType() const
1010
return m_originDevice->getAPIType();
1111
}
1212

13+
// Grow `range` to satisfy the `VkPhysicalDeviceLimits::nonCoherentAtomSize` alignment rules of
// vkFlushMappedMemoryRanges / vkInvalidateMappedMemoryRanges:
// - the offset is rounded DOWN to the nearest atom boundary,
// - the END of the range (`offset+length`) is rounded UP to the nearest atom boundary,
//   clamped so it never exceeds the allocation's size.
// The returned range always fully covers the requested one.
IDeviceMemoryAllocation::MemoryRange IDeviceMemoryAllocation::alignNonCoherentRange(MemoryRange range) const
{
	const auto alignment = m_originDevice->getPhysicalDevice()->getLimits().nonCoherentAtomSize;
	// compute the aligned end BEFORE touching `offset`, otherwise aligning `offset` down would
	// shrink the covered end (the old code aligned `length` up independently, which could leave
	// the tail of a misaligned request outside the flushed/invalidated range)
	const size_t alignedEnd = core::min(core::alignUp(range.offset+range.length,alignment),m_allocationSize);
	range.offset = core::alignDown(range.offset,alignment);
	// measure the length from the aligned-down offset so `offset+length==alignedEnd` stays within the allocation
	range.length = alignedEnd-range.offset;
	return range;
}
20+
1321
}

src/nbl/video/utilities/IUtilities.cpp

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,12 @@ bool IUtilities::updateImageViaStagingBuffer(
7373

7474
regionsToCopy.reserve(maxIterations);
7575

76+
core::vector<ILogicalDevice::MappedMemoryRange> flushRanges;
77+
const bool manualFlush = m_defaultUploadBuffer.get()->needsManualFlushOrInvalidate();
78+
if (manualFlush)
79+
flushRanges.reserve(maxIterations);
80+
81+
auto* uploadBuffer = m_defaultUploadBuffer.get()->getBuffer();
7682
while (!regionIterator.isFinished())
7783
{
7884
size_t memoryNeededForRemainingRegions = regionIterator.getMemoryNeededForRemainingRegions();
@@ -95,6 +101,11 @@ bool IUtilities::updateImageViaStagingBuffer(
95101
// keep trying again
96102
if (failedAllocation)
97103
{
104+
if (!flushRanges.empty())
105+
{
106+
m_device->flushMappedMemoryRanges(flushRanges);
107+
flushRanges.clear();
108+
}
98109
const auto completed = intendedNextSubmit.getFutureScratchSemaphore();
99110
intendedNextSubmit.overflowSubmit(scratch);
100111
// overflowSubmit no longer blocks for the last submit to have completed, so we must do it ourselves here
@@ -123,22 +134,23 @@ bool IUtilities::updateImageViaStagingBuffer(
123134
}
124135

125136
if (!regionsToCopy.empty())
126-
scratch->cmdbuf->copyBufferToImage(m_defaultUploadBuffer.get()->getBuffer(), dstImage, currentDstImageLayout, regionsToCopy.size(), regionsToCopy.data());
137+
scratch->cmdbuf->copyBufferToImage(uploadBuffer, dstImage, currentDstImageLayout, regionsToCopy.size(), regionsToCopy.data());
127138

128139
assert(!regionsToCopy.empty() && "allocationSize is not enough to support the smallest possible transferable units to image, may be caused if your queueFam's minImageTransferGranularity is large or equal to <0,0,0>.");
129140

130141
// some platforms expose non-coherent host-visible GPU memory, so writes need to be flushed explicitly
131-
if (m_defaultUploadBuffer.get()->needsManualFlushOrInvalidate())
142+
if (manualFlush)
132143
{
133144
const auto consumedMemory = allocationSize - availableUploadBufferMemory;
134-
auto flushRange = AlignedMappedMemoryRange(m_defaultUploadBuffer.get()->getBuffer()->getBoundMemory().memory, localOffset, consumedMemory, limits.nonCoherentAtomSize);
135-
m_device->flushMappedMemoryRanges(1u, &flushRange);
145+
flushRanges.emplace_back(uploadBuffer->getBoundMemory().memory, localOffset, consumedMemory, ILogicalDevice::MappedMemoryRange::align_non_coherent_tag);
136146
}
137147
}
138148

139149
// this doesn't actually free the memory, the memory is queued up to be freed only after the GPU fence/event is signalled
140150
m_defaultUploadBuffer.get()->multi_deallocate(1u,&localOffset,&allocationSize,intendedNextSubmit.getFutureScratchSemaphore()); // can queue with a reset but not yet pending fence, just fine
141151
}
152+
if (!flushRanges.empty())
153+
m_device->flushMappedMemoryRanges(flushRanges);
142154
return true;
143155
}
144156

0 commit comments

Comments
 (0)