Skip to content

Commit b108039

Browse files
devshkeptsecret
authored andcommitted
fix certain things in Image conversion but convinced whole thing needs a rewrite due to poor usage of barriers
1 parent f4160b6 commit b108039

File tree

2 files changed

+84
-49
lines changed

2 files changed

+84
-49
lines changed

src/nbl/video/utilities/CAssetConverter.cpp

Lines changed: 83 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -3504,21 +3504,38 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
35043504
assert(reservations.m_converter.get()==this);
35053505
auto device = m_params.device;
35063506

3507-
const auto reqQueueFlags = reservations.getRequiredQueueFlags(false);
3508-
35093507
// compacted TLASes need to be substituted in cache and Descriptor Sets
35103508
core::unordered_map<const IGPUTopLevelAccelerationStructure*,smart_refctd_ptr<IGPUTopLevelAccelerationStructure>> compactedTLASMap;
35113509
// Anything to do?
3510+
auto reqQueueFlags = reservations.m_queueFlags;
35123511
if (reqQueueFlags.value!=IQueue::FAMILY_FLAGS::NONE)
35133512
{
3514-
auto familyNotInSpan = [](const uint32_t family, const uint32_t* families, const uint8_t count)->bool
3515-
{
3516-
const auto end = families+count;
3517-
return std::find(families,end,family)==end;
3518-
};
3519-
35203513
// whether we actually get around to doing that depends on validity and success of transfers
35213514
const bool shouldDoSomeCompute = reqQueueFlags.hasFlags(IQueue::FAMILY_FLAGS::COMPUTE_BIT);
3515+
auto invalidIntended = [device,logger](const IQueue::FAMILY_FLAGS flag, const SIntendedSubmitInfo* intended)->bool
3516+
{
3517+
if (!intended || !intended->valid())
3518+
{
3519+
logger.log("Invalid `SIntendedSubmitInfo` for queue capability %d!",system::ILogger::ELL_ERROR,flag);
3520+
return true;
3521+
}
3522+
const auto* queue = intended->queue;
3523+
if (queue->getOriginDevice()!=device)
3524+
{
3525+
logger.log("Provided Queue's device %p doesn't match CAssetConverter's device %p!",system::ILogger::ELL_ERROR,queue->getOriginDevice(),device);
3526+
return true;
3527+
}
3528+
const auto& qFamProps = device->getPhysicalDevice()->getQueueFamilyProperties();
3529+
if (!qFamProps[queue->getFamilyIndex()].queueFlags.hasFlags(flag))
3530+
{
3531+
logger.log("Provided Queue %p in Family %d does not have the required capabilities %d!",system::ILogger::ELL_ERROR,queue,queue->getFamilyIndex(),flag);
3532+
return true;
3533+
}
3534+
return false;
3535+
};
3536+
// If the compute queue will be used, the compute Intended Submit Info must be valid
3537+
if (shouldDoSomeCompute && invalidIntended(IQueue::FAMILY_FLAGS::COMPUTE_BIT,params.compute))
3538+
return retval;
35223539
// the flag check stops us derefercing an invalid pointer
35233540
const auto computeFamily = shouldDoSomeCompute ? params.compute->queue->getFamilyIndex():IQueue::FamilyIgnored;
35243541

@@ -3585,6 +3602,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
35853602
logger.log("Acceleration Structure Scratch Device Memory Allocator not mapped and not concurrently share-able by Transfer Family %d!",system::ILogger::ELL_ERROR,transferFamily);
35863603
return retval;
35873604
}
3605+
reqQueueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT;
35883606
}
35893607
}
35903608
// the elusive and exotic host builds
@@ -3600,43 +3618,19 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
36003618
return retval;
36013619
}
36023620

3603-
// TODO: work this out in a different way!
3604-
bool shouldDoSomeTransfer = !deviceASBuildScratchPtr || reqQueueFlags.hasFlags(IQueue::FAMILY_FLAGS::TRANSFER_BIT);
3605-
const auto transferFamily = shouldDoSomeTransfer ? params.transfer->queue->getFamilyIndex():IQueue::FamilyIgnored;
36063621
//
3622+
const auto reqQueueFlags = reservations.getRequiredQueueFlags(deviceASBuildScratchPtr);
3623+
bool shouldDoSomeTransfer = reqQueueFlags.hasFlags(IQueue::FAMILY_FLAGS::TRANSFER_BIT);
36073624
{
3608-
auto invalidIntended = [device,logger](const IQueue::FAMILY_FLAGS flag, const SIntendedSubmitInfo* intended)->bool
3609-
{
3610-
if (!intended || !intended->valid())
3611-
{
3612-
logger.log("Invalid `SIntendedSubmitInfo` for queue capability %d!",system::ILogger::ELL_ERROR,flag);
3613-
return true;
3614-
}
3615-
const auto* queue = intended->queue;
3616-
if (queue->getOriginDevice()!=device)
3617-
{
3618-
logger.log("Provided Queue's device %p doesn't match CAssetConverter's device %p!",system::ILogger::ELL_ERROR,queue->getOriginDevice(),device);
3619-
return true;
3620-
}
3621-
const auto& qFamProps = device->getPhysicalDevice()->getQueueFamilyProperties();
3622-
if (!qFamProps[queue->getFamilyIndex()].queueFlags.hasFlags(flag))
3623-
{
3624-
logger.log("Provided Queue %p in Family %d does not have the required capabilities %d!",system::ILogger::ELL_ERROR,queue,queue->getFamilyIndex(),flag);
3625-
return true;
3626-
}
3627-
return false;
3628-
};
36293625
// If the transfer queue will be used, the transfer Intended Submit Info must be valid and utilities must be provided
36303626
auto reqTransferQueueCaps = IQueue::FAMILY_FLAGS::TRANSFER_BIT;
36313627
// Depth/Stencil transfers need Graphics Capabilities, so make sure the queue chosen for transfers also has them!
36323628
if (reservations.m_queueFlags.hasFlags(IQueue::FAMILY_FLAGS::GRAPHICS_BIT))
36333629
reqTransferQueueCaps |= IQueue::FAMILY_FLAGS::GRAPHICS_BIT;
36343630
if (shouldDoSomeTransfer && invalidIntended(reqTransferQueueCaps,params.transfer))
36353631
return retval;
3636-
// If the compute queue will be used, the compute Intended Submit Info must be valid and utilities must be provided
3637-
if (shouldDoSomeCompute && invalidIntended(IQueue::FAMILY_FLAGS::COMPUTE_BIT,params.compute))
3638-
return retval;
36393632
}
3633+
const auto transferFamily = shouldDoSomeTransfer ? params.transfer->queue->getFamilyIndex():IQueue::FamilyIgnored;
36403634

36413635
// The current begun Xfer and Compute commandbuffer changing because of submit of Xfer or Compute would be a royal mess to deal with
36423636
if (shouldDoSomeTransfer && shouldDoSomeCompute)
@@ -3652,6 +3646,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
36523646
return retval;
36533647
}
36543648
}
3649+
const bool uniQueue = !shouldDoSomeTransfer || !shouldDoSomeCompute || params.transfer->queue->getNativeHandle()==params.compute->queue->getNativeHandle();
36553650

36563651
//
36573652
if (shouldDoSomeTransfer && (!params.utilities || params.utilities->getLogicalDevice()!=device))
@@ -3743,14 +3738,14 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
37433738
};
37443739

37453740
// some state so we don't need to look later
3746-
auto xferCmdBuf = params.transfer->getCommandBufferForRecording();
3741+
auto xferCmdBuf = shouldDoSomeTransfer ? params.transfer->getCommandBufferForRecording():nullptr;
37473742

37483743
using buffer_mem_barrier_t = IGPUCommandBuffer::SBufferMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier>;
37493744
// upload Buffers
37503745
auto& buffersToUpload = reservations.m_bufferConversions;
37513746
{
3752-
core::vector<buffer_mem_barrier_t> ownershipTransfers;
3753-
ownershipTransfers.reserve(buffersToUpload.size());
3747+
core::vector<buffer_mem_barrier_t> finalReleases;
3748+
finalReleases.reserve(buffersToUpload.size());
37543749
// do the uploads
37553750
if (!buffersToUpload.empty())
37563751
{
@@ -3787,7 +3782,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
37873782
submitsNeeded |= IQueue::FAMILY_FLAGS::TRANSFER_BIT;
37883783
// enqueue ownership release if necessary
37893784
if (ownerQueueFamily!=IQueue::FamilyIgnored)
3790-
ownershipTransfers.push_back({
3785+
finalReleases.push_back({
37913786
.barrier = {
37923787
.dep = {
37933788
.srcStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
@@ -3807,14 +3802,13 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
38073802
}
38083803
buffersToUpload.clear();
38093804
// release ownership
3810-
if (!ownershipTransfers.empty())
3811-
pipelineBarrier(xferCmdBuf,{.memBarriers={},.bufBarriers=ownershipTransfers},"Ownership Releases of Buffers Failed");
3805+
if (!finalReleases.empty())
3806+
pipelineBarrier(xferCmdBuf,{.memBarriers={},.bufBarriers=finalReleases},"Ownership Releases of Buffers Failed");
38123807
}
38133808

38143809
const auto* physDev = device->getPhysicalDevice();
38153810

3816-
const bool uniQueue = !shouldDoSomeCompute || params.transfer->queue->getNativeHandle()==params.compute->queue->getNativeHandle();
3817-
// whenever transfer needs to do a submit overflow because it ran out of memory for streaming an image, we can already submit the recorded mip-map compute shader dispatches
3811+
// whenever transfer needs to do a submit overflow because it ran out of memory for streaming, we can already submit the recorded compute shader dispatches
38183812
auto computeCmdBuf = shouldDoSomeCompute ? params.compute->getCommandBufferForRecording():nullptr;
38193813
auto drainCompute = [&params,&computeCmdBuf](const std::span<const IQueue::SSubmitInfo::SSemaphoreInfo> extraSignal={})->auto
38203814
{
@@ -3823,6 +3817,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
38233817
// before we overflow submit we need to inject extra wait semaphores
38243818
auto& waitSemaphoreSpan = params.compute->waitSemaphores;
38253819
std::unique_ptr<IQueue::SSubmitInfo::SSemaphoreInfo[]> patchedWaits;
3820+
// the transfer scratch semaphore value, is from the last submit, not the future value we're enqueing all the deferred memory releases with
38263821
if (waitSemaphoreSpan.empty())
38273822
waitSemaphoreSpan = {&params.transfer->scratchSemaphore,1};
38283823
else
@@ -3852,6 +3847,13 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
38523847
if (origXferStallCallback)
38533848
origXferStallCallback(tillScratchResettable);
38543849
};
3850+
// when overflowing compute resources, we need to submit the Xfer before submitting Compute
3851+
auto drainBoth = [&params,&xferCmdBuf,&drainCompute](const std::span<const IQueue::SSubmitInfo::SSemaphoreInfo> extraSignal={})->auto
3852+
{
3853+
if (xferCmdBuf && !xferCmdBuf->cmdbuf->empty())
3854+
params.transfer->overflowSubmit(xferCmdBuf);
3855+
return drainCompute();
3856+
};
38553857

38563858
auto& imagesToUpload = reservations.m_imageConversions;
38573859
if (!imagesToUpload.empty())
@@ -3895,6 +3897,11 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
38953897
}
38963898
auto quickWriteDescriptor = [device,logger,&dsAlloc](const uint32_t binding, const uint32_t arrayElement, core::smart_refctd_ptr<IGPUImageView> view)->bool
38973899
{
3900+
if (arrayElement==SubAllocatedDescriptorSet::invalid_value)
3901+
{
3902+
logger.log("Failed to allocate from binding %d in the Suballocated Descriptor Sets!",system::ILogger::ELL_ERROR,binding);
3903+
return false;
3904+
}
38983905
auto* ds = dsAlloc->getDescriptorSet();
38993906
IGPUDescriptorSet::SDescriptorInfo info = {};
39003907
info.desc = std::move(view);
@@ -3916,7 +3923,11 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
39163923

39173924
// because of the layout transitions
39183925
params.transfer->scratchSemaphore.stageMask |= PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS;
3919-
//
3926+
// TODO:: Shall we rewrite? e.g. we upload everything first, extra submit for QFOT pipeline barrier & transition in overflow callback, then record compute commands, and submit them, plus their final QFOTs
3927+
// Lets analyze sync cases:
3928+
// - Single Queue = Semaphore Signal is sufficient,
3929+
// - Two distinct Queues = no barrier, semaphore signal-wait is sufficient
3930+
// - Two distinct Queue Families Exclusive Sharing mode = QFOT necessary
39203931
core::vector<IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier>> transferBarriers;
39213932
core::vector<IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier>> computeBarriers;
39223933
transferBarriers.reserve(MaxMipLevelsPastBase);
@@ -3946,6 +3957,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
39463957
dsAlloc->multi_deallocate(SrcMipBinding,1,&srcIx,{});
39473958
});
39483959
IGPUImageView::E_TYPE viewType = IGPUImageView::E_TYPE::ET_2D_ARRAY;
3960+
// create Mipmapping source Image View, allocate its place in the descriptor set and write it
39493961
if (item.recomputeMips)
39503962
{
39513963
switch (creationParams.type)
@@ -3971,7 +3983,8 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
39713983
// its our own resource, it will eventually be free
39723984
while (dsAlloc->multi_allocate(SrcMipBinding,1,&srcIx)!=0)
39733985
{
3974-
drainCompute();
3986+
if (drainBoth()!=IQueue::RESULT::SUCCESS)
3987+
break;
39753988
//params.compute->overflowCallback(); // erm what semaphore would we even be waiting for? TODO: need an event handler/timeline method to give lowest latch event/semaphore value
39763989
dsAlloc->cull_frees();
39773990
}
@@ -3981,6 +3994,24 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
39813994
continue;
39823995
}
39833996
}
3997+
// there might be some QFOT releases from transfer to compute which need to happen before we execute Compute
3998+
auto drain = [&]()->bool
3999+
{
4000+
if (item.recomputeMips && transferBarriers.empty())
4001+
return drainCompute()==IQueue::RESULT::SUCCESS;
4002+
else if (pipelineBarrier(xferCmdBuf,{.memBarriers={},.bufBarriers={},.imgBarriers=transferBarriers},"Recording QFOT Release from Transfer Queue Familye after overflow failed"))
4003+
{
4004+
if (drainBoth()!=IQueue::RESULT::SUCCESS)
4005+
return false;
4006+
transferBarriers.clear();
4007+
}
4008+
else
4009+
{
4010+
markFailureInStaging("Image QFOT Pipeline Barrier",item.canonical,image,pFoundHash);
4011+
return false;
4012+
}
4013+
return true;
4014+
};
39844015
//
39854016
using layout_t = IGPUImage::LAYOUT;
39864017
// record optional transitions to transfer/mip recompute layout and optional transfers, then transitions to desired layout after transfer
@@ -4147,7 +4178,8 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
41474178
for (uint32_t i=0; dsAlloc->try_multi_allocate(DstMipBinding,1,&dstIx)!=0; i++)
41484179
{
41494180
if (i) // don't submit on first fail
4150-
drainCompute();
4181+
if (!drain())
4182+
break;
41514183
dsAlloc->cull_frees();
41524184
}
41534185
if (quickWriteDescriptor(DstMipBinding,dstIx,std::move(dstView)))
@@ -4236,9 +4268,10 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
42364268
// stall callback is only called if multiple buffering of scratch commandbuffers fails, we also want to submit compute if transfer was submitted
42374269
if (oldImmediateSubmitSignalValue != params.transfer->scratchSemaphore.value)
42384270
{
4239-
drainCompute();
42404271
// and our recording scratch commandbuffer most likely changed
42414272
xferCmdBuf = params.transfer->getCommandBufferForRecording();
4273+
if (!drain())
4274+
break;
42424275
}
42434276
}
42444277
// new layout becomes old
@@ -4249,7 +4282,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
42494282
if (sourceForNextMipCompute)
42504283
{
42514284
// If submitting to same queue, then we use compute commandbuffer to perform the barrier between Xfer and compute stages.
4252-
// also do this if no QFOT, because no barrier needed at all because layout stays unchanged and semaphore signal-wait perform big memory barriers
4285+
// also do this if no QFOT, because no barrier needed at all as layout stays unchanged and semaphore signal-wait perform big memory barriers
42534286
if (uniQueue || computeFamily==transferFamily || concurrentSharing)
42544287
continue;
42554288
// stay in the same layout, no transition (both match)
@@ -4299,10 +4332,12 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
42994332
markFailureInStaging("Image Data Upload Pipeline Barrier",item.canonical,image,pFoundHash);
43004333
continue;
43014334
}
4335+
// even if no uploads performed, we do layout transitions on empty images from Xfer Queue
43024336
submitsNeeded |= IQueue::FAMILY_FLAGS::TRANSFER_BIT;
43034337
}
43044338
if (!computeBarriers.empty())
43054339
{
4340+
// the RAII exiter does an immediate "failure deallocation" without any semaphore dependant deferral, so preempt it here
43064341
dsAlloc->multi_deallocate(SrcMipBinding,1,&srcIx,params.compute->getFutureScratchSemaphore());
43074342
if (!pipelineBarrier(computeCmdBuf,{.memBarriers={},.bufBarriers={},.imgBarriers=computeBarriers},"Final Pipeline Barrier recording to Compute Command Buffer failed"))
43084343
{

src/nbl/video/utilities/CComputeBlit.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ struct ConstevalParameters
9999
inputs.writeShaderCache = m_shaderCache.get();
100100
// no pipeline cache, because we only make the same pipeline once, ever
101101
auto reserveResults = info.converter->reserve(inputs);
102-
assert(reserveResults.getRequiredQueueFlags().value==IQueue::FAMILY_FLAGS::NONE);
102+
assert(reserveResults.getRequiredQueueFlags(false).value==IQueue::FAMILY_FLAGS::NONE);
103103

104104
// copy over the results
105105
{

0 commit comments

Comments
 (0)