Skip to content

Commit 2f43bba

Browse files
committed
Merge branch 'asset-conversion-v3' into cad_large_texture_streaming
# Conflicts: # examples_tests
2 parents 04dd049 + a9bd557 commit 2f43bba

File tree

8 files changed

+204
-664
lines changed

8 files changed

+204
-664
lines changed

include/nbl/asset/IImageView.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ class IImageView : public IImageViewBase
123123
// declared some usages but they are not a subset
124124
{
125125
// https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkImageViewCreateInfo.html#VUID-VkImageViewCreateInfo-pNext-02663
126-
if (subresourceRange.aspectMask.hasFlags(IImage::EAF_STENCIL_BIT) && !imgParams.stencilUsage.hasFlags(_params.subUsages))
126+
if (subresourceRange.aspectMask.hasFlags(IImage::EAF_STENCIL_BIT) && !imgParams.actualStencilUsage().hasFlags(_params.subUsages))
127127
return false;
128128
// https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkImageViewCreateInfo.html#VUID-VkImageViewCreateInfo-pNext-02664
129129
if ((subresourceRange.aspectMask.value&(~IImage::EAF_STENCIL_BIT)) && !imgParams.usage.hasFlags(_params.subUsages))

include/nbl/core/util/bitflag.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ struct bitflag final
3636
// Inequality: true when the two bitflags hold different `value`s.
constexpr bool operator!=(const bitflag<ENUM_TYPE> rhs) const {return value!=rhs.value;}
3737
// Equality: true when the two bitflags hold the same `value`.
constexpr bool operator==(const bitflag<ENUM_TYPE> rhs) const {return value==rhs.value;}
3838
// True only when EVERY bit set in `val` is also set in `value` (subset test).
// Note: an empty `val` (no bits set) always yields true.
constexpr bool hasFlags(const bitflag<ENUM_TYPE> val) const {return (static_cast<UNDERLYING_TYPE>(value) & static_cast<UNDERLYING_TYPE>(val.value)) == static_cast<UNDERLYING_TYPE>(val.value);}
39+
// True when AT LEAST ONE bit set in `val` is also set in `value` (intersection test).
// Note: an empty `val` (no bits set) always yields false, unlike `hasFlags`.
constexpr bool hasAnyFlag(const bitflag<ENUM_TYPE> val) const {return (static_cast<UNDERLYING_TYPE>(value) & static_cast<UNDERLYING_TYPE>(val.value)) != static_cast<UNDERLYING_TYPE>(0);}
3940
};
4041

4142
template<typename T, typename Dummy>

include/nbl/video/utilities/IGPUObjectFromAssetConverter.h

Lines changed: 0 additions & 517 deletions
Large diffs are not rendered by default.

include/nbl/video/utilities/IUtilities.h

Lines changed: 59 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -211,9 +211,10 @@ class NBL_API2 IUtilities : public core::IReferenceCounted
211211
auto queue = intendedSubmit.queue;
212212
if (!queue)
213213
{
214-
// TODO: log error
214+
m_logger.log("No queue in the `intendedSubmit`!",system::ILogger::ELL_ERROR);
215215
return IQueue::RESULT::OTHER_ERROR;
216216
}
217+
217218
// backup in case we need to restore to unmodified state
218219
SIntendedSubmitInfo patchedSubmit;
219220
memcpy(&patchedSubmit,&intendedSubmit,sizeof(SIntendedSubmitInfo));
@@ -227,92 +228,65 @@ class NBL_API2 IUtilities : public core::IReferenceCounted
227228
}
228229

229230
// patch the commandbuffers if needed
231+
core::vector<core::smart_refctd_ptr<IGPUCommandBuffer>> newScratch;
230232
core::vector<IQueue::SSubmitInfo::SCommandBufferInfo> patchedCmdBufs;
231-
auto patchCmdBuf = [&]()->void{patchedCmdBufs.resize(patchedSubmit.commandBuffers.size()+1);};
232-
if (auto* candidateScratch=patchedSubmit.getScratchCommandBuffer(); candidateScratch)
233-
switch(candidateScratch->getState())
234-
{
235-
case IGPUCommandBuffer::STATE::INITIAL:
236-
case IGPUCommandBuffer::STATE::INVALID:
237-
if (candidateScratch->isResettable() && candidateScratch->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT))
238-
break;
239-
patchCmdBuf();
240-
break;
241-
case IGPUCommandBuffer::STATE::RECORDING:
242-
if (candidateScratch->isResettable() && candidateScratch->getRecordingFlags().hasFlags(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT))
243-
break;
244-
candidateScratch->end();
245-
patchCmdBuf();
246-
break;
247-
default:
248-
patchCmdBuf();
249-
break;
250-
}
251-
else
252-
patchCmdBuf();
253-
254-
core::smart_refctd_ptr<IGPUCommandBuffer> newScratch;
255-
if (!patchedCmdBufs.empty())
233+
if (patchedSubmit.scratchCommandBuffers.empty())
256234
{
257-
// allocate a span one larger than the original
258-
const auto origCmdBufs = patchedSubmit.commandBuffers;
259-
patchedSubmit.commandBuffers = patchedCmdBufs;
260-
// copy the original commandbuffers
261-
std::copy(origCmdBufs.begin(),origCmdBufs.end(),patchedCmdBufs.begin());
262-
// create the scratch commandbuffer (the patching)
235+
constexpr size_t defaultSumbitsInFlight = 8;
236+
newScratch.resize(defaultSumbitsInFlight);
237+
// create the scratch commandbuffers (the patching)
263238
{
264239
auto device = const_cast<ILogicalDevice*>(queue->getOriginDevice());
265240
auto pool = device->createCommandPool(queue->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
266-
if (!pool || !pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&newScratch,1}))
241+
if (!pool || !pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,newScratch))
267242
{
268-
// TODO: log error
269-
return IQueue::RESULT::OTHER_ERROR;
270-
}
271-
if (!newScratch->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT))
272-
{
273-
// TODO: log error
243+
m_logger.log("Either couldn't create a command pool or the command buffers!",system::ILogger::ELL_ERROR);
274244
return IQueue::RESULT::OTHER_ERROR;
275245
}
276246
}
277-
patchedCmdBufs[origCmdBufs.size()] = {newScratch.get()};
278-
patchedSubmit.commandBuffers = patchedCmdBufs;
247+
// begin
248+
if (auto cmdbuf=newScratch.front().get(); !cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT))
249+
{
250+
m_logger.log("Could not begin command buffer %p",system::ILogger::ELL_ERROR,cmdbuf);
251+
return IQueue::RESULT::OTHER_ERROR;
252+
}
253+
// then fill the info vector
254+
patchedCmdBufs.reserve(newScratch.size());
255+
for (const auto& cmdbuf : newScratch)
256+
patchedCmdBufs.emplace_back(cmdbuf.get());
257+
patchedSubmit.scratchCommandBuffers = patchedCmdBufs;
279258
}
280259

281260
if (!patchedSubmit.valid())
282261
{
283-
// TODO: log error
262+
m_logger.log("Even patching failed to create a valid `SIntendedSubmitInfo`!",system::ILogger::ELL_ERROR);
284263
return IQueue::RESULT::OTHER_ERROR;
285264
}
286265

287266
if (!what(patchedSubmit))
288267
{
289-
// TODO: log error
268+
m_logger.log("Function to `autoSubmit` failed recording/overflowing!",system::ILogger::ELL_ERROR);
290269
return IQueue::RESULT::OTHER_ERROR;
291270
}
292271
// no way back now, have to modify the intended submit
293272
memcpy(&intendedSubmit,&patchedSubmit,sizeof(intendedSubmit));
294-
intendedSubmit.getScratchCommandBuffer()->end();
295-
296-
const auto submit = intendedSubmit.popSubmit(extraSignalSemaphores);
297-
if (newScratch)
298-
intendedSubmit.commandBuffers = {};
273+
auto finalScratch = intendedSubmit.valid()->cmdbuf;
274+
finalScratch->end();
275+
const auto submit = intendedSubmit.popSubmit(finalScratch,extraSignalSemaphores);
276+
// have to let go of our temporaries
277+
if (!patchedCmdBufs.empty())
278+
intendedSubmit.scratchCommandBuffers = {};
299279
if (const auto error=queue->submit(submit); error!=IQueue::RESULT::SUCCESS)
300280
{
301281
if (patchedSemaphore)
302-
{
303-
intendedSubmit.waitSemaphores = {};
304282
intendedSubmit.scratchSemaphore = {};
305-
}
306283
return error;
307284
}
308285

309286
ISemaphore::future_t<IQueue::RESULT> retval(IQueue::RESULT::SUCCESS);
310287
retval.set({intendedSubmit.scratchSemaphore.semaphore,intendedSubmit.scratchSemaphore.value});
311288
if (patchedSemaphore)
312-
{
313-
intendedSubmit.waitSemaphores = {};
314289
intendedSubmit.scratchSemaphore = {};
315-
}
316290
return retval;
317291
}
318292

@@ -343,14 +317,14 @@ class NBL_API2 IUtilities : public core::IReferenceCounted
343317
return false;
344318
}
345319

346-
if (!commonTransferValidation(nextSubmit))
320+
auto* scratch = commonTransferValidation(nextSubmit);
321+
if (!scratch)
347322
return false;
348323

349324
const auto& limits = m_device->getPhysicalDevice()->getLimits();
350325
// TODO: Why did we settle on `/4` ? It definitely wasn't about the uint32_t size!
351326
const uint32_t optimalTransferAtom = core::min<uint32_t>(limits.maxResidentInvocations*OptimalCoalescedInvocationXferSize,m_defaultUploadBuffer->get_total_size()/4);
352327

353-
auto cmdbuf = nextSubmit.getScratchCommandBuffer();
354328
// no pipeline barriers necessary because write and optional flush happens before submit, and memory allocation is reclaimed after fence signal
355329
for (size_t uploadedSize=0ull; uploadedSize<bufferRange.size;)
356330
{
@@ -359,7 +333,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted
359333
// how large we can make the allocation
360334
uint32_t maxFreeBlock = m_defaultUploadBuffer.get()->max_size();
361335
// get allocation size
362-
const uint32_t allocationSize = getAllocationSizeForStreamingBuffer(size, m_allocationAlignment, maxFreeBlock, optimalTransferAtom);
336+
const uint32_t allocationSize = getAllocationSizeForStreamingBuffer(size,m_allocationAlignment,maxFreeBlock,optimalTransferAtom);
363337
// make sure we don't overrun the destination buffer due to padding
364338
const uint32_t subSize = core::min(allocationSize,size);
365339
// cannot use `multi_place` because of the extra padding size we could have added
@@ -373,7 +347,13 @@ class NBL_API2 IUtilities : public core::IReferenceCounted
373347
}
374348
else
375349
{
376-
nextSubmit.overflowSubmit();
350+
const auto completed = nextSubmit.getFutureScratchSemaphore();
351+
nextSubmit.overflowSubmit(scratch);
352+
// overflowSubmit no longer blocks for the last submit to have completed, so we must do it ourselves here
353+
// TODO: if we cleverly overflowed BEFORE completely running out of memory (better heuristics) then we wouldn't need to do this and some CPU-GPU overlap could be achieved
354+
if (nextSubmit.overflowCallback)
355+
nextSubmit.overflowCallback(completed);
356+
m_device->blockForSemaphores({&completed,1});
377357
continue; // keep trying again
378358
}
379359
// some platforms expose non-coherent host-visible GPU memory, so writes need to be flushed explicitly
@@ -387,9 +367,9 @@ class NBL_API2 IUtilities : public core::IReferenceCounted
387367
copy.srcOffset = localOffset;
388368
copy.dstOffset = bufferRange.offset+uploadedSize;
389369
copy.size = subSize;
390-
cmdbuf->copyBuffer(m_defaultUploadBuffer.get()->getBuffer(), bufferRange.buffer.get(), 1u, &copy);
370+
scratch->cmdbuf->copyBuffer(m_defaultUploadBuffer.get()->getBuffer(), bufferRange.buffer.get(), 1u, &copy);
391371
// this doesn't actually free the memory, the memory is queued up to be freed only after the `scratchSemaphore` reaches a value a future submit will signal
392-
m_defaultUploadBuffer.get()->multi_deallocate(1u,&localOffset,&allocationSize,nextSubmit.getFutureScratchSemaphore(),&cmdbuf);
372+
m_defaultUploadBuffer.get()->multi_deallocate(1u,&localOffset,&allocationSize,nextSubmit.getFutureScratchSemaphore(),&scratch->cmdbuf);
393373
uploadedSize += subSize;
394374
}
395375
return true;
@@ -520,14 +500,14 @@ class NBL_API2 IUtilities : public core::IReferenceCounted
520500
return false;
521501
}
522502

523-
if (!commonTransferValidation(nextSubmit))
503+
auto* scratch = commonTransferValidation(nextSubmit);
504+
if (!scratch)
524505
return false;
525506

526507
const auto& limits = m_device->getPhysicalDevice()->getLimits();
527508
// TODO: Why did we settle on `/4` ? It definitely wasn't about the uint32_t size!
528509
const uint32_t optimalTransferAtom = core::min<uint32_t>(limits.maxResidentInvocations*OptimalCoalescedInvocationXferSize,m_defaultDownloadBuffer->get_total_size()/4);
529510

530-
auto cmdbuf = nextSubmit.getScratchCommandBuffer();
531511
// Basically downloadedSize is downloadRecordedIntoCommandBufferSize :D
532512
for (size_t downloadedSize=0ull; downloadedSize<srcBufferRange.size;)
533513
{
@@ -547,12 +527,12 @@ class NBL_API2 IUtilities : public core::IReferenceCounted
547527
copy.srcOffset = srcBufferRange.offset + downloadedSize;
548528
copy.dstOffset = localOffset;
549529
copy.size = copySize;
550-
cmdbuf->copyBuffer(srcBufferRange.buffer.get(),m_defaultDownloadBuffer->getBuffer(),1u,&copy);
530+
scratch->cmdbuf->copyBuffer(srcBufferRange.buffer.get(),m_defaultDownloadBuffer->getBuffer(),1u,&copy);
551531

552532
auto dataConsumer = core::make_smart_refctd_ptr<CDownstreamingDataConsumer>(
553533
IDeviceMemoryAllocation::MemoryRange(localOffset,copySize),
554534
consumeCallback,
555-
core::smart_refctd_ptr<IGPUCommandBuffer>(cmdbuf),
535+
core::smart_refctd_ptr<IGPUCommandBuffer>(scratch->cmdbuf),
556536
m_defaultDownloadBuffer.get(),
557537
downloadedSize
558538
);
@@ -561,7 +541,15 @@ class NBL_API2 IUtilities : public core::IReferenceCounted
561541
downloadedSize += copySize;
562542
}
563543
else // but first submit the already buffered up copies
564-
nextSubmit.overflowSubmit();
544+
{
545+
const auto completed = nextSubmit.getFutureScratchSemaphore();
546+
nextSubmit.overflowSubmit(scratch);
547+
// overflowSubmit no longer blocks for the last submit to have completed, so we must do it ourselves here
548+
// TODO: if we cleverly overflowed BEFORE completely running out of memory (better heuristics) then we wouldn't need to do this and some CPU-GPU overlap could be achieved
549+
if (nextSubmit.overflowCallback)
550+
nextSubmit.overflowCallback(completed);
551+
m_device->blockForSemaphores({&completed,1});
552+
}
565553
}
566554
return true;
567555
}
@@ -684,23 +672,24 @@ class NBL_API2 IUtilities : public core::IReferenceCounted
684672

685673
protected:
686674
// Shared precondition check for the transfer utilities.
// Logs an error and fails when `intendedNextSubmit.valid()` yields a null/false result, or when the
// queue family of `intendedNextSubmit.queue` does not report `IQueue::FAMILY_FLAGS::TRANSFER_BIT`.
// Added (`+`) lines: returns the pointer obtained from `valid()` on success and `nullptr` on failure
// (callers visible in this diff use it as `scratch->cmdbuf` — presumably the scratch command buffer to record into).
// Removed (`-`) lines: the previous version returned a plain `bool`.
687-
inline bool commonTransferValidation(const SIntendedSubmitInfo& intendedNextSubmit)
675+
inline const IQueue::SSubmitInfo::SCommandBufferInfo* commonTransferValidation(const SIntendedSubmitInfo& intendedNextSubmit)
688676
{
689-
if (!intendedNextSubmit.valid())
677+
auto retval = intendedNextSubmit.valid();
678+
if (!retval)
690679
{
691680
m_logger.log("Invalid `intendedNextSubmit`.", nbl::system::ILogger::ELL_ERROR);
692-
return false;
681+
return nullptr;
693682
}
694683

695684
assert(intendedNextSubmit.queue);
696685
auto queueFamProps = m_device->getPhysicalDevice()->getQueueFamilyProperties()[intendedNextSubmit.queue->getFamilyIndex()];
697686
if (!queueFamProps.queueFlags.hasFlags(IQueue::FAMILY_FLAGS::TRANSFER_BIT))
698687
{
699688
m_logger.log("Invalid `intendedNextSubmit.queue` is not capable of transfer operations!", nbl::system::ILogger::ELL_ERROR);
700-
return false;
689+
return nullptr;
701690
}
702691

703-
return true;
692+
return retval;
704693
}
705694

706695
// The application must round down the start of the range to the nearest multiple of VkPhysicalDeviceLimits::nonCoherentAtomSize,

0 commit comments

Comments
 (0)