Skip to content

Commit 4f06f21

Browse files
Merge pull request #322 from Devsh-Graphics-Programming/android
Forced Defragmentation of Streaming Buffer (and small copy prevention)
2 parents 79a03d3 + 43de03e commit 4f06f21

File tree

2 files changed

+38
-13
lines changed

2 files changed

+38
-13
lines changed

include/nbl/core/alloc/GeneralpurposeAddressAllocator.h

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -697,10 +697,27 @@ namespace core
697697

698698
// aliases
699699
// Single-threaded variant: publicly derives from GeneralpurposeAddressAllocator<size_type>
// (it was a plain `using` alias before this change) and adds a public defragment().
// NOTE(review): no `using` declaration for the base constructors is visible here, so only the
// implicitly declared constructors exist on this type - confirm that is sufficient for callers.
template<typename size_type>
700-
using GeneralpurposeAddressAllocatorST = GeneralpurposeAddressAllocator<size_type>;
700+
class GeneralpurposeAddressAllocatorST : public GeneralpurposeAddressAllocator<size_type>
701+
{
702+
public:
703+
// Forwards straight to the base allocator's defragment(); no locking is performed here.
inline void defragment() noexcept
704+
{
705+
GeneralpurposeAddressAllocator<size_type>::defragment();
706+
}
707+
};
701708

702709
// Multi-threaded variant: publicly derives from the concurrency adaptor wrapped around
// GeneralpurposeAddressAllocator<size_type> (it was a plain `using` alias before this change)
// and adds a public defragment() that runs while the adaptor's lock is held.
// NOTE(review): no `using` declaration for the base constructors is visible here - confirm
// callers do not rely on constructors of the adaptor being reachable through this type.
template<typename size_type, class RecursiveLockable>
703-
using GeneralpurposeAddressAllocatorMT = AddressAllocatorBasicConcurrencyAdaptor<GeneralpurposeAddressAllocator<size_type>,RecursiveLockable>;
710+
class GeneralpurposeAddressAllocatorMT : public AddressAllocatorBasicConcurrencyAdaptor<GeneralpurposeAddressAllocator<size_type>,RecursiveLockable>
711+
{
712+
// shorthand for the concurrency adaptor this class derives from
using Base = AddressAllocatorBasicConcurrencyAdaptor<GeneralpurposeAddressAllocator<size_type>,RecursiveLockable>;
713+
public:
714+
// Acquires the adaptor's lock, calls the underlying allocator's defragment(), then releases the lock.
// NOTE(review): lock()/unlock() are paired by hand rather than through a scoped guard; this relies on
// the defragment() call in between not exiting early - confirm against the base allocator.
inline void defragment() noexcept
715+
{
716+
Base::get_lock().lock();
717+
GeneralpurposeAddressAllocator<size_type>::defragment();
718+
Base::get_lock().unlock();
719+
}
720+
};
704721

705722
}
706723
}

include/nbl/video/utilities/IUtilities.h

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -291,7 +291,7 @@ namespace nbl::video
291291
)
292292
{
293293
const auto& limits = m_device->getPhysicalDevice()->getLimits();
294-
const uint32_t memoryLowerBound = limits.maxResidentInvocations * sizeof(uint32_t);
294+
const uint32_t optimalTransferAtom = limits.maxResidentInvocations*sizeof(uint32_t);
295295
const uint32_t alignment = static_cast<uint32_t>(limits.nonCoherentAtomSize);
296296

297297
auto* cmdpool = cmdbuf->getPool();
@@ -301,15 +301,23 @@ namespace nbl::video
301301
// no pipeline barriers necessary because write and optional flush happens before submit, and memory allocation is reclaimed after fence signal
302302
for (size_t uploadedSize = 0ull; uploadedSize < bufferRange.size;)
303303
{
304-
const uint32_t size = bufferRange.size - uploadedSize;
305-
const uint32_t paddedSize = static_cast<uint32_t>(core::min<uint64_t>(
306-
core::alignDown(m_defaultUploadBuffer.get()->max_size(), alignment),
307-
core::alignUp(size, alignment)
308-
));
309-
const uint32_t subSize = core::min(paddedSize, size);
304+
// how much hasn't been uploaded yet
305+
const size_t size = bufferRange.size-uploadedSize;
306+
// flushed ranges must respect the non-coherent atom size, so we need to pad
307+
const size_t paddedSize = core::alignUp(size,alignment);
308+
// how large we can make the allocation
309+
uint32_t maxFreeBlock = core::alignDown(m_defaultUploadBuffer.get()->max_size(),alignment);
310+
// don't want to be stuck doing tiny copies, better defragment the allocator by forcing an allocation failure
311+
const bool largeEnoughTransfer = maxFreeBlock>=paddedSize || maxFreeBlock>=optimalTransferAtom;
312+
// how big of an allocation we'll make
313+
const uint32_t alllocationSize = static_cast<uint32_t>(core::min<size_t>(
314+
largeEnoughTransfer ? maxFreeBlock:optimalTransferAtom,paddedSize
315+
));
316+
// make sure we don't overrun the destination buffer due to padding
317+
const uint32_t subSize = core::min(alllocationSize,size);
310318
// cannot use `multi_place` because of the extra padding size we could have added
311319
uint32_t localOffset = video::StreamingTransientDataBufferMT<>::invalid_address;
312-
m_defaultUploadBuffer.get()->multi_alloc(std::chrono::high_resolution_clock::now() + std::chrono::microseconds(500u), 1u, &localOffset, &paddedSize, &alignment);
320+
m_defaultUploadBuffer.get()->multi_alloc(std::chrono::high_resolution_clock::now()+std::chrono::microseconds(500u),1u,&localOffset,&alllocationSize,&alignment);
313321
// copy only the unpadded part
314322
if (localOffset != video::StreamingTransientDataBufferMT<>::invalid_address)
315323
{
@@ -346,8 +354,8 @@ namespace nbl::video
346354
// some platforms expose non-coherent host-visible GPU memory, so writes need to be flushed explicitly
347355
if (m_defaultUploadBuffer.get()->needsManualFlushOrInvalidate())
348356
{
349-
IDriverMemoryAllocation::MappedMemoryRange flushRange(m_defaultUploadBuffer.get()->getBuffer()->getBoundMemory(), localOffset, paddedSize);
350-
m_device->flushMappedMemoryRanges(1u, &flushRange);
357+
IDriverMemoryAllocation::MappedMemoryRange flushRange(m_defaultUploadBuffer.get()->getBuffer()->getBoundMemory(),localOffset,alllocationSize);
358+
m_device->flushMappedMemoryRanges(1u,&flushRange);
351359
}
352360
// after we make sure writes are in GPU memory (visible to GPU) and not still in a cache, we can copy using the GPU to device-only memory
353361
asset::SBufferCopy copy;
@@ -356,7 +364,7 @@ namespace nbl::video
356364
copy.size = subSize;
357365
cmdbuf->copyBuffer(m_defaultUploadBuffer.get()->getBuffer(), bufferRange.buffer.get(), 1u, &copy);
358366
// this doesn't actually free the memory, the memory is queued up to be freed only after the GPU fence/event is signalled
359-
m_defaultUploadBuffer.get()->multi_free(1u, &localOffset, &paddedSize, core::smart_refctd_ptr<IGPUFence>(fence), &cmdbuf); // can queue with a reset but not yet pending fence, just fine
367+
m_defaultUploadBuffer.get()->multi_free(1u,&localOffset,&alllocationSize,core::smart_refctd_ptr<IGPUFence>(fence),&cmdbuf); // can queue with a reset but not yet pending fence, just fine
360368
uploadedSize += subSize;
361369
}
362370
}

0 commit comments

Comments
 (0)