@@ -3504,21 +3504,38 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
3504
3504
assert (reservations.m_converter .get ()==this );
3505
3505
auto device = m_params.device ;
3506
3506
3507
- const auto reqQueueFlags = reservations.getRequiredQueueFlags (false );
3508
-
3509
3507
// compacted TLASes need to be substituted in cache and Descriptor Sets
3510
3508
core::unordered_map<const IGPUTopLevelAccelerationStructure*,smart_refctd_ptr<IGPUTopLevelAccelerationStructure>> compactedTLASMap;
3511
3509
// Anything to do?
3510
+ auto reqQueueFlags = reservations.m_queueFlags ;
3512
3511
if (reqQueueFlags.value !=IQueue::FAMILY_FLAGS::NONE)
3513
3512
{
3514
- auto familyNotInSpan = [](const uint32_t family, const uint32_t * families, const uint8_t count)->bool
3515
- {
3516
- const auto end = families+count;
3517
- return std::find (families,end,family)==end;
3518
- };
3519
-
3520
3513
// whether we actually get around to doing that depends on validity and success of transfers
3521
3514
const bool shouldDoSomeCompute = reqQueueFlags.hasFlags (IQueue::FAMILY_FLAGS::COMPUTE_BIT);
3515
+ auto invalidIntended = [device,logger](const IQueue::FAMILY_FLAGS flag, const SIntendedSubmitInfo* intended)->bool
3516
+ { // Validation predicate: returns true (after logging an error) when `intended` fails any check below for the capabilities in `flag`, false when it passes all of them
3517
+ if (!intended || !intended->valid ()) // null pointer, or a submit info that fails its own valid() check
3518
+ {
3519
+ logger.log (" Invalid `SIntendedSubmitInfo` for queue capability %d!" ,system::ILogger::ELL_ERROR,flag);
3520
+ return true ;
3521
+ }
3522
+ const auto * queue = intended->queue ;
3523
+ if (queue->getOriginDevice ()!=device) // the queue must originate from the device captured by this lambda
3524
+ {
3525
+ logger.log (" Provided Queue's device %p doesn't match CAssetConverter's device %p!" ,system::ILogger::ELL_ERROR,queue->getOriginDevice (),device);
3526
+ return true ;
3527
+ }
3528
+ const auto & qFamProps = device->getPhysicalDevice ()->getQueueFamilyProperties ();
3529
+ if (!qFamProps[queue->getFamilyIndex ()].queueFlags .hasFlags (flag)) // the queue's family, as reported by the physical device, must have the capabilities in `flag`
3530
+ {
3531
+ logger.log (" Provided Queue %p in Family %d does not have the required capabilities %d!" ,system::ILogger::ELL_ERROR,queue,queue->getFamilyIndex (),flag);
3532
+ return true ;
3533
+ }
3534
+ return false ; // every check passed
3535
+ };
3536
+ // If the compute queue will be used, the compute Intended Submit Info must be valid
3537
+ if (shouldDoSomeCompute && invalidIntended (IQueue::FAMILY_FLAGS::COMPUTE_BIT,params.compute ))
3538
+ return retval;
3522
3539
// the flag check stops us dereferencing an invalid pointer
3523
3540
const auto computeFamily = shouldDoSomeCompute ? params.compute ->queue ->getFamilyIndex ():IQueue::FamilyIgnored;
3524
3541
@@ -3585,6 +3602,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
3585
3602
logger.log (" Acceleration Structure Scratch Device Memory Allocator not mapped and not concurrently share-able by Transfer Family %d!" ,system::ILogger::ELL_ERROR,transferFamily);
3586
3603
return retval;
3587
3604
}
3605
+ reqQueueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT;
3588
3606
}
3589
3607
}
3590
3608
// the elusive and exotic host builds
@@ -3600,43 +3618,19 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
3600
3618
return retval;
3601
3619
}
3602
3620
3603
- // TODO: work this out in a different way!
3604
- bool shouldDoSomeTransfer = !deviceASBuildScratchPtr || reqQueueFlags.hasFlags (IQueue::FAMILY_FLAGS::TRANSFER_BIT);
3605
- const auto transferFamily = shouldDoSomeTransfer ? params.transfer ->queue ->getFamilyIndex ():IQueue::FamilyIgnored;
3606
3621
//
3622
+ const auto reqQueueFlags = reservations.getRequiredQueueFlags (deviceASBuildScratchPtr);
3623
+ bool shouldDoSomeTransfer = reqQueueFlags.hasFlags (IQueue::FAMILY_FLAGS::TRANSFER_BIT);
3607
3624
{
3608
- auto invalidIntended = [device,logger](const IQueue::FAMILY_FLAGS flag, const SIntendedSubmitInfo* intended)->bool
3609
- {
3610
- if (!intended || !intended->valid ())
3611
- {
3612
- logger.log (" Invalid `SIntendedSubmitInfo` for queue capability %d!" ,system::ILogger::ELL_ERROR,flag);
3613
- return true ;
3614
- }
3615
- const auto * queue = intended->queue ;
3616
- if (queue->getOriginDevice ()!=device)
3617
- {
3618
- logger.log (" Provided Queue's device %p doesn't match CAssetConverter's device %p!" ,system::ILogger::ELL_ERROR,queue->getOriginDevice (),device);
3619
- return true ;
3620
- }
3621
- const auto & qFamProps = device->getPhysicalDevice ()->getQueueFamilyProperties ();
3622
- if (!qFamProps[queue->getFamilyIndex ()].queueFlags .hasFlags (flag))
3623
- {
3624
- logger.log (" Provided Queue %p in Family %d does not have the required capabilities %d!" ,system::ILogger::ELL_ERROR,queue,queue->getFamilyIndex (),flag);
3625
- return true ;
3626
- }
3627
- return false ;
3628
- };
3629
3625
// If the transfer queue will be used, the transfer Intended Submit Info must be valid and utilities must be provided
3630
3626
auto reqTransferQueueCaps = IQueue::FAMILY_FLAGS::TRANSFER_BIT;
3631
3627
// Depth/Stencil transfers need Graphics Capabilities, so make sure the queue chosen for transfers also has them!
3632
3628
if (reservations.m_queueFlags .hasFlags (IQueue::FAMILY_FLAGS::GRAPHICS_BIT))
3633
3629
reqTransferQueueCaps |= IQueue::FAMILY_FLAGS::GRAPHICS_BIT;
3634
3630
if (shouldDoSomeTransfer && invalidIntended (reqTransferQueueCaps,params.transfer ))
3635
3631
return retval;
3636
- // If the compute queue will be used, the compute Intended Submit Info must be valid and utilities must be provided
3637
- if (shouldDoSomeCompute && invalidIntended (IQueue::FAMILY_FLAGS::COMPUTE_BIT,params.compute ))
3638
- return retval;
3639
3632
}
3633
+ const auto transferFamily = shouldDoSomeTransfer ? params.transfer ->queue ->getFamilyIndex ():IQueue::FamilyIgnored;
3640
3634
3641
3635
// The current begun Xfer and Compute commandbuffer changing because of submit of Xfer or Compute would be a royal mess to deal with
3642
3636
if (shouldDoSomeTransfer && shouldDoSomeCompute)
@@ -3652,6 +3646,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
3652
3646
return retval;
3653
3647
}
3654
3648
}
3649
+ const bool uniQueue = !shouldDoSomeTransfer || !shouldDoSomeCompute || params.transfer ->queue ->getNativeHandle ()==params.compute ->queue ->getNativeHandle ();
3655
3650
3656
3651
//
3657
3652
if (shouldDoSomeTransfer && (!params.utilities || params.utilities ->getLogicalDevice ()!=device))
@@ -3743,14 +3738,14 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
3743
3738
};
3744
3739
3745
3740
// some state so we don't need to look later
3746
- auto xferCmdBuf = params.transfer ->getCommandBufferForRecording ();
3741
+ auto xferCmdBuf = shouldDoSomeTransfer ? params.transfer ->getCommandBufferForRecording (): nullptr ;
3747
3742
3748
3743
using buffer_mem_barrier_t = IGPUCommandBuffer::SBufferMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier>;
3749
3744
// upload Buffers
3750
3745
auto & buffersToUpload = reservations.m_bufferConversions ;
3751
3746
{
3752
- core::vector<buffer_mem_barrier_t > ownershipTransfers ;
3753
- ownershipTransfers .reserve (buffersToUpload.size ());
3747
+ core::vector<buffer_mem_barrier_t > finalReleases ;
3748
+ finalReleases .reserve (buffersToUpload.size ());
3754
3749
// do the uploads
3755
3750
if (!buffersToUpload.empty ())
3756
3751
{
@@ -3787,7 +3782,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
3787
3782
submitsNeeded |= IQueue::FAMILY_FLAGS::TRANSFER_BIT;
3788
3783
// enqueue ownership release if necessary
3789
3784
if (ownerQueueFamily!=IQueue::FamilyIgnored)
3790
- ownershipTransfers .push_back ({
3785
+ finalReleases .push_back ({
3791
3786
.barrier = {
3792
3787
.dep = {
3793
3788
.srcStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
@@ -3807,14 +3802,13 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
3807
3802
}
3808
3803
buffersToUpload.clear ();
3809
3804
// release ownership
3810
- if (!ownershipTransfers .empty ())
3811
- pipelineBarrier (xferCmdBuf,{.memBarriers ={},.bufBarriers =ownershipTransfers }," Ownership Releases of Buffers Failed" );
3805
+ if (!finalReleases .empty ())
3806
+ pipelineBarrier (xferCmdBuf,{.memBarriers ={},.bufBarriers =finalReleases }," Ownership Releases of Buffers Failed" );
3812
3807
}
3813
3808
3814
3809
const auto * physDev = device->getPhysicalDevice ();
3815
3810
3816
- const bool uniQueue = !shouldDoSomeCompute || params.transfer ->queue ->getNativeHandle ()==params.compute ->queue ->getNativeHandle ();
3817
- // whenever transfer needs to do a submit overflow because it ran out of memory for streaming an image, we can already submit the recorded mip-map compute shader dispatches
3811
+ // whenever transfer needs to do a submit overflow because it ran out of memory for streaming, we can already submit the recorded compute shader dispatches
3818
3812
auto computeCmdBuf = shouldDoSomeCompute ? params.compute ->getCommandBufferForRecording ():nullptr ;
3819
3813
auto drainCompute = [¶ms,&computeCmdBuf](const std::span<const IQueue::SSubmitInfo::SSemaphoreInfo> extraSignal={})->auto
3820
3814
{
@@ -3823,6 +3817,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
3823
3817
// before we overflow submit we need to inject extra wait semaphores
3824
3818
auto & waitSemaphoreSpan = params.compute ->waitSemaphores ;
3825
3819
std::unique_ptr<IQueue::SSubmitInfo::SSemaphoreInfo[]> patchedWaits;
3820
+ // the transfer scratch semaphore value is from the last submit, not the future value we're enqueuing all the deferred memory releases with
3826
3821
if (waitSemaphoreSpan.empty ())
3827
3822
waitSemaphoreSpan = {¶ms.transfer ->scratchSemaphore ,1 };
3828
3823
else
@@ -3852,6 +3847,13 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
3852
3847
if (origXferStallCallback)
3853
3848
origXferStallCallback (tillScratchResettable);
3854
3849
};
3850
+ // when overflowing compute resources, we need to submit the Xfer before submitting Compute; `extraSignal` is forwarded to the compute drain, whose result is returned
3851
+ auto drainBoth = [&params,&xferCmdBuf,&drainCompute](const std::span<const IQueue::SSubmitInfo::SSemaphoreInfo> extraSignal={})->auto
3852
+ {
3853
+ if (xferCmdBuf && !xferCmdBuf->cmdbuf ->empty ()) // skip the transfer submit when there is no transfer command buffer or nothing was recorded into it
3854
+ params.transfer ->overflowSubmit (xferCmdBuf); // NOTE(review): the result of this submit is not checked - confirm whether a failure here should abort before draining compute
3855
+ return drainCompute (extraSignal); // pass the caller's extra signal semaphores on instead of silently dropping them
3856
+ };
3855
3857
3856
3858
auto & imagesToUpload = reservations.m_imageConversions ;
3857
3859
if (!imagesToUpload.empty ())
@@ -3895,6 +3897,11 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
3895
3897
}
3896
3898
auto quickWriteDescriptor = [device,logger,&dsAlloc](const uint32_t binding, const uint32_t arrayElement, core::smart_refctd_ptr<IGPUImageView> view)->bool
3897
3899
{
3900
+ if (arrayElement==SubAllocatedDescriptorSet::invalid_value)
3901
+ {
3902
+ logger.log (" Failed to allocate from binding %d in the Suballocated Descriptor Sets!" ,system::ILogger::ELL_ERROR,binding);
3903
+ return false ;
3904
+ }
3898
3905
auto * ds = dsAlloc->getDescriptorSet ();
3899
3906
IGPUDescriptorSet::SDescriptorInfo info = {};
3900
3907
info.desc = std::move (view);
@@ -3916,7 +3923,11 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
3916
3923
3917
3924
// because of the layout transitions
3918
3925
params.transfer ->scratchSemaphore .stageMask |= PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS;
3919
- //
3926
+ // TODO: Shall we rewrite? e.g. we upload everything first, extra submit for QFOT pipeline barrier & transition in overflow callback, then record compute commands, and submit them, plus their final QFOTs
3927
+ // Let's analyze sync cases:
3928
+ // - Single Queue = Semaphore Signal is sufficient,
3929
+ // - Two distinct Queues = no barrier, semaphore signal-wait is sufficient
3930
+ // - Two distinct Queue Families Exclusive Sharing mode = QFOT necessary
3920
3931
core::vector<IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier>> transferBarriers;
3921
3932
core::vector<IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier>> computeBarriers;
3922
3933
transferBarriers.reserve (MaxMipLevelsPastBase);
@@ -3946,6 +3957,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
3946
3957
dsAlloc->multi_deallocate (SrcMipBinding,1 ,&srcIx,{});
3947
3958
});
3948
3959
IGPUImageView::E_TYPE viewType = IGPUImageView::E_TYPE::ET_2D_ARRAY;
3960
+ // create Mipmapping source Image View, allocate its place in the descriptor set and write it
3949
3961
if (item.recomputeMips )
3950
3962
{
3951
3963
switch (creationParams.type )
@@ -3971,7 +3983,8 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
3971
3983
// its our own resource, it will eventually be free
3972
3984
while (dsAlloc->multi_allocate (SrcMipBinding,1 ,&srcIx)!=0 )
3973
3985
{
3974
- drainCompute ();
3986
+ if (drainBoth ()!=IQueue::RESULT::SUCCESS)
3987
+ break ;
3975
3988
// params.compute->overflowCallback(); // erm what semaphore would we even be waiting for? TODO: need an event handler/timeline method to give lowest latch event/semaphore value
3976
3989
dsAlloc->cull_frees ();
3977
3990
}
@@ -3981,6 +3994,24 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
3981
3994
continue ;
3982
3995
}
3983
3996
}
3997
+ // there might be some QFOT releases from transfer to compute which need to happen before we execute Compute
3998
+ auto drain = [&]()->bool
3999
+ { // Returns true when the pending work was drained successfully, false when a submit or the barrier recording failed
4000
+ if (item.recomputeMips && transferBarriers.empty ()) // mip recompute with no pending transfer barriers: only the compute side is drained
4001
+ return drainCompute ()==IQueue::RESULT::SUCCESS;
4002
+ else if (pipelineBarrier (xferCmdBuf,{.memBarriers ={},.bufBarriers ={},.imgBarriers =transferBarriers}," Recording QFOT Release from Transfer Queue Familye after overflow failed" )) // otherwise record the pending transfer barriers into the transfer command buffer first; NOTE(review): this branch is also reached when transferBarriers is empty and item.recomputeMips is false - confirm pipelineBarrier accepts an empty barrier list
4003
+ {
4004
+ if (drainBoth ()!=IQueue::RESULT::SUCCESS)
4005
+ return false ; // NOTE(review): unlike the barrier-failure branch below, this path does not call markFailureInStaging - confirm that is intended
4006
+ transferBarriers.clear (); // the barriers were recorded and drained, so they must not be recorded a second time
4007
+ }
4008
+ else
4009
+ {
4010
+ markFailureInStaging (" Image QFOT Pipeline Barrier" ,item.canonical ,image,pFoundHash);
4011
+ return false ;
4012
+ }
4013
+ return true ;
4014
+ };
3984
4015
//
3985
4016
using layout_t = IGPUImage::LAYOUT;
3986
4017
// record optional transitions to transfer/mip recompute layout and optional transfers, then transitions to desired layout after transfer
@@ -4147,7 +4178,8 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
4147
4178
for (uint32_t i=0 ; dsAlloc->try_multi_allocate (DstMipBinding,1 ,&dstIx)!=0 ; i++)
4148
4179
{
4149
4180
if (i) // don't submit on first fail
4150
- drainCompute ();
4181
+ if (!drain ())
4182
+ break ;
4151
4183
dsAlloc->cull_frees ();
4152
4184
}
4153
4185
if (quickWriteDescriptor (DstMipBinding,dstIx,std::move (dstView)))
@@ -4236,9 +4268,10 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
4236
4268
// stall callback is only called if multiple buffering of scratch commandbuffers fails, we also want to submit compute if transfer was submitted
4237
4269
if (oldImmediateSubmitSignalValue != params.transfer ->scratchSemaphore .value )
4238
4270
{
4239
- drainCompute ();
4240
4271
// and our recording scratch commandbuffer most likely changed
4241
4272
xferCmdBuf = params.transfer ->getCommandBufferForRecording ();
4273
+ if (!drain ())
4274
+ break ;
4242
4275
}
4243
4276
}
4244
4277
// new layout becomes old
@@ -4249,7 +4282,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
4249
4282
if (sourceForNextMipCompute)
4250
4283
{
4251
4284
// If submitting to same queue, then we use compute commandbuffer to perform the barrier between Xfer and compute stages.
4252
- // also do this if no QFOT, because no barrier needed at all because layout stays unchanged and semaphore signal-wait perform big memory barriers
4285
+ // also do this if no QFOT, because no barrier needed at all as layout stays unchanged and semaphore signal-wait performs big memory barriers
4253
4286
if (uniQueue || computeFamily==transferFamily || concurrentSharing)
4254
4287
continue ;
4255
4288
// stay in the same layout, no transition (both match)
@@ -4299,10 +4332,12 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
4299
4332
markFailureInStaging (" Image Data Upload Pipeline Barrier" ,item.canonical ,image,pFoundHash);
4300
4333
continue ;
4301
4334
}
4335
+ // even if no uploads performed, we do layout transitions on empty images from Xfer Queue
4302
4336
submitsNeeded |= IQueue::FAMILY_FLAGS::TRANSFER_BIT;
4303
4337
}
4304
4338
if (!computeBarriers.empty ())
4305
4339
{
4340
+ // the RAII exiter does an immediate "failure deallocation" without any semaphore dependent deferral, so preempt it here
4306
4341
dsAlloc->multi_deallocate (SrcMipBinding,1 ,&srcIx,params.compute ->getFutureScratchSemaphore ());
4307
4342
if (!pipelineBarrier (computeCmdBuf,{.memBarriers ={},.bufBarriers ={},.imgBarriers =computeBarriers}," Final Pipeline Barrier recording to Compute Command Buffer failed" ))
4308
4343
{
0 commit comments