@@ -291,7 +291,7 @@ namespace nbl::video
291
291
)
292
292
{
293
293
const auto & limits = m_device->getPhysicalDevice ()->getLimits ();
294
- const uint32_t memoryLowerBound = limits.maxResidentInvocations * sizeof (uint32_t );
294
+ const uint32_t optimalTransferAtom = limits.maxResidentInvocations * sizeof (uint32_t );
295
295
const uint32_t alignment = static_cast <uint32_t >(limits.nonCoherentAtomSize );
296
296
297
297
auto * cmdpool = cmdbuf->getPool ();
@@ -301,15 +301,23 @@ namespace nbl::video
301
301
// no pipeline barriers necessary because write and optional flush happens before submit, and memory allocation is reclaimed after fence signal
302
302
for (size_t uploadedSize = 0ull ; uploadedSize < bufferRange.size ;)
303
303
{
304
- const uint32_t size = bufferRange.size - uploadedSize;
305
- const uint32_t paddedSize = static_cast <uint32_t >(core::min<uint64_t >(
306
- core::alignDown (m_defaultUploadBuffer.get ()->max_size (), alignment),
307
- core::alignUp (size, alignment)
308
- ));
309
- const uint32_t subSize = core::min (paddedSize, size);
304
+ // how much hasn't been uploaded yet
305
+ const size_t size = bufferRange.size -uploadedSize;
306
+ // due to coherent flushing atom sizes, we need to pad
307
+ const size_t paddedSize = core::alignUp (size,alignment);
308
+ // how large we can make the allocation
309
+ uint32_t maxFreeBlock = core::alignDown (m_defaultUploadBuffer.get ()->max_size (),alignment);
310
+ // don't want to be stuck doing tiny copies, better defragment the allocator by forcing an allocation failure
311
+ const bool largeEnoughTransfer = maxFreeBlock>=paddedSize || maxFreeBlock>=optimalTransferAtom;
312
+ // how big of an allocation we'll make
313
+ const uint32_t alllocationSize = static_cast <uint32_t >(core::min<size_t >(
314
+ largeEnoughTransfer ? maxFreeBlock:optimalTransferAtom,paddedSize
315
+ ));
316
+ // make sure we don't overrun the destination buffer due to padding
317
+ const uint32_t subSize = core::min (alllocationSize,size);
310
318
// cannot use `multi_place` because of the extra padding size we could have added
311
319
uint32_t localOffset = video::StreamingTransientDataBufferMT<>::invalid_address;
312
- m_defaultUploadBuffer.get ()->multi_alloc (std::chrono::high_resolution_clock::now () + std::chrono::microseconds (500u ), 1u , &localOffset, &paddedSize, &alignment);
320
+ m_defaultUploadBuffer.get ()->multi_alloc (std::chrono::high_resolution_clock::now ()+ std::chrono::microseconds (500u ),1u ,&localOffset,&alllocationSize, &alignment);
313
321
// copy only the unpadded part
314
322
if (localOffset != video::StreamingTransientDataBufferMT<>::invalid_address)
315
323
{
@@ -346,8 +354,8 @@ namespace nbl::video
346
354
// some platforms expose non-coherent host-visible GPU memory, so writes need to be flushed explicitly
347
355
if (m_defaultUploadBuffer.get ()->needsManualFlushOrInvalidate ())
348
356
{
349
- IDriverMemoryAllocation::MappedMemoryRange flushRange (m_defaultUploadBuffer.get ()->getBuffer ()->getBoundMemory (), localOffset, paddedSize );
350
- m_device->flushMappedMemoryRanges (1u , &flushRange);
357
+ IDriverMemoryAllocation::MappedMemoryRange flushRange (m_defaultUploadBuffer.get ()->getBuffer ()->getBoundMemory (),localOffset,alllocationSize );
358
+ m_device->flushMappedMemoryRanges (1u ,&flushRange);
351
359
}
352
360
// after we make sure writes are in GPU memory (visible to GPU) and not still in a cache, we can copy using the GPU to device-only memory
353
361
asset::SBufferCopy copy;
@@ -356,7 +364,7 @@ namespace nbl::video
356
364
copy.size = subSize;
357
365
cmdbuf->copyBuffer (m_defaultUploadBuffer.get ()->getBuffer (), bufferRange.buffer .get (), 1u , ©);
358
366
// this doesn't actually free the memory, the memory is queued up to be freed only after the GPU fence/event is signalled
359
- m_defaultUploadBuffer.get ()->multi_free (1u , &localOffset, &paddedSize, core::smart_refctd_ptr<IGPUFence>(fence), &cmdbuf); // can queue with a reset but not yet pending fence, just fine
367
+ m_defaultUploadBuffer.get ()->multi_free (1u ,&localOffset,&alllocationSize, core::smart_refctd_ptr<IGPUFence>(fence),&cmdbuf); // can queue with a reset but not yet pending fence, just fine
360
368
uploadedSize += subSize;
361
369
}
362
370
}
0 commit comments