@@ -445,7 +445,7 @@ class AssetVisitor : public CRTP
		}

	private:
-		// there is no `impl()` overload taking `ICPUTopLevelAccelerationStructure`, same as there is no `ICPUImage` one
+		// there is no `impl()` overload taking `ICPUBottomLevelAccelerationStructure`, same as there is no `ICPUImage` one
		inline bool impl(const instance_t<ICPUTopLevelAccelerationStructure>& instance, const CAssetConverter::patch_t<ICPUTopLevelAccelerationStructure>& userPatch)
		{
			const auto blasInstances = instance.asset->getInstances();
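`AssetVisitor` dispatches per-asset-type `impl()` overloads through CRTP, so the comment above is a compile-time statement: a type with no dependants to visit (like `ICPUBottomLevelAccelerationStructure` or `ICPUImage`) simply gets no overload, and trying to descend into it fails to build rather than silently doing nothing. A minimal sketch of that idea, with stand-in types (not the Nabla classes):

```cpp
#include <cstdio>

struct TLAS {};  // stand-in for an asset with dependants (its BLAS instances)
struct Image {}; // stand-in for a leaf asset: deliberately no impl() overload

template<typename CRTP>
struct Visitor
{
	// forwards to the derived class; a leaf type has no impl() overload,
	// so a hypothetical visit(Image) would be a compile error, not a no-op
	bool visit(const TLAS& as) { return static_cast<CRTP*>(this)->impl(as); }
};

struct PrintingVisitor : Visitor<PrintingVisitor>
{
	bool impl(const TLAS&) { std::puts("descending into BLAS instances"); return true; }
};

int main()
{
	PrintingVisitor v;
	return v.visit(TLAS{}) ? 0 : 1;
}
```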
@@ -1656,6 +1656,9 @@ class GetDependantVisit;
template<>
class GetDependantVisit<ICPUTopLevelAccelerationStructure> : public GetDependantVisitBase<ICPUTopLevelAccelerationStructure>
{
+	public:
+		CAssetConverter::SReserveResult::SConvReqTLAS::cpu_to_gpu_blas_map_t* instanceMap;
+
	protected:
		bool descend_impl(
			const instance_t<AssetType>& user, const CAssetConverter::patch_t<AssetType>& userPatch,
@@ -1666,6 +1669,7 @@ class GetDependantVisit<ICPUTopLevelAccelerationStructure> : public GetDependant
			auto depObj = getDependant<ICPUBottomLevelAccelerationStructure>(dep,soloPatch);
			if (!depObj)
				return false;
+			instanceMap->operator[](dep.asset) = std::move(depObj);
			return true;
		}
};
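With the new `instanceMap` member, the visitor records every CPU BLAS → GPU BLAS resolution as a side effect of the dependency check it already performs in `descend_impl`, so the later TLAS build step can look up instances per request instead of consulting a global build map. A sketch of the pattern with simplified stand-in types (`std::shared_ptr` plays the role of `smart_refctd_ptr`):

```cpp
#include <memory>
#include <unordered_map>

struct CpuBLAS {};
struct GpuBLAS {};

// stand-in for SConvReqTLAS::cpu_to_gpu_blas_map_t
using cpu_to_gpu_blas_map_t = std::unordered_map<const CpuBLAS*,std::shared_ptr<GpuBLAS>>;

struct TLASVisit
{
	cpu_to_gpu_blas_map_t* instanceMap; // owned by the per-TLAS conversion request

	bool descend(const CpuBLAS* dep)
	{
		auto depObj = resolve(dep); // stand-in for getDependant<ICPUBottomLevelAccelerationStructure>()
		if (!depObj)
			return false; // a dependency failed to convert, abort the whole visit
		// remember which GPU BLAS this CPU BLAS resolved to
		instanceMap->operator[](dep) = std::move(depObj);
		return true;
	}

	static std::shared_ptr<GpuBLAS> resolve(const CpuBLAS*) { return std::make_shared<GpuBLAS>(); }
};
```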
@@ -3397,9 +3401,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult
	// now allocate the memory for buffers and images
	deferredAllocator.finalize();

-	// TODO: everything below is slightly wrong due to not having a final top-down dependency checking pass throwing away useless non-root GPU subtrees
-
-	// find out which buffers need to be uploaded via a staging buffer
+	// enqueue successfully created buffers for conversion
	for (auto& entry : bufferConversions.contentHashToCanonical)
	for (auto i=0ull; i<entry.second.copyCount; i++)
	if (auto& gpuBuff=bufferConversions.gpuObjects[i+entry.second.firstCopyIx].value; gpuBuff)
@@ -3414,7 +3416,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult
	{
		constexpr bool IsTLAS = std::is_same_v<AccelerationStructure,ICPUTopLevelAccelerationStructure>;
		//
-		SReserveResult::SConvReqAccelerationStructureMap<AccelerationStructure>* pConversions;
+		std::conditional_t<IsTLAS,SReserveResult::SConvReqTLASMap,SReserveResult::SConvReqBLASMap>* pConversions;
		if constexpr (IsTLAS)
			pConversions = retval.m_tlasConversions;
		else
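Replacing the single `SConvReqAccelerationStructureMap<AccelerationStructure>` template with `std::conditional_t` lets the TLAS and BLAS request types carry different payloads (only the TLAS request gains an `instanceMap`) while the selection stays a compile-time type switch. A self-contained illustration with placeholder map types:

```cpp
#include <map>
#include <string>
#include <type_traits>

struct BLASRequest {};
struct TLASRequest { std::string note; };                 // TLAS requests carry extra state
using SConvReqBLASMap = std::multimap<int,BLASRequest>;   // placeholder key type
using SConvReqTLASMap = std::multimap<int,TLASRequest>;

template<bool IsTLAS>
void pickConversions(SConvReqTLASMap* tlas, SConvReqBLASMap* blas)
{
	// the pointer's type follows the compile-time flag, exactly as in the patch
	std::conditional_t<IsTLAS,SConvReqTLASMap,SConvReqBLASMap>* pConversions;
	if constexpr (IsTLAS)
		pConversions = tlas;
	else
		pConversions = blas;
	(void)pConversions; // the real code goes on to enqueue requests into *pConversions
}
```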
@@ -3437,11 +3439,12 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult
			};
		}
		smart_refctd_ptr<typename asset_traits<AccelerationStructure>::video_t> as;
+		CAssetConverter::SReserveResult::SConvReqTLAS::cpu_to_gpu_blas_map_t blasInstanceMap;
		if constexpr (IsTLAS)
		{
			// check if the BLASes we want to use for the instances were successfully allocated and created
			AssetVisitor<GetDependantVisit<ICPUTopLevelAccelerationStructure>> visitor = {
-				{inputs,dfsCaches},
+				{inputs,dfsCaches,&blasInstanceMap},
				{canonical,deferredParams.uniqueCopyGroupID},
				patch
			};
@@ -3469,14 +3472,16 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult
			request.compact = patch.compactAfterBuild;
			request.buildFlags = static_cast<uint16_t>(patch.getBuildFlags(canonical).value);
			request.buildSize = deferredParams.buildSize;
+			if constexpr (IsTLAS)
+				request.instanceMap = std::move(blasInstanceMap);
		}
	};
	createAccelerationStructures.template operator()<ICPUBottomLevelAccelerationStructure>();
	blasConversions.propagateToCaches(std::get<dfs_cache<ICPUBottomLevelAccelerationStructure>>(dfsCaches),std::get<SReserveResult::staging_cache_t<ICPUBottomLevelAccelerationStructure>>(retval.m_stagingCaches));
	createAccelerationStructures.template operator()<ICPUTopLevelAccelerationStructure>();
	tlasConversions.propagateToCaches(std::get<dfs_cache<ICPUTopLevelAccelerationStructure>>(dfsCaches),std::get<SReserveResult::staging_cache_t<ICPUTopLevelAccelerationStructure>>(retval.m_stagingCaches));
}
-// find out which images need what caps for the transfer and mipmapping
+// enqueue successfully created images with data to upload for conversion
auto& dfsCacheImages = std::get<dfs_cache<ICPUImage>>(dfsCaches);
for (auto& entry : imageConversions.contentHashToCanonical)
for (auto i=0ull; i<entry.second.copyCount; i++)
@@ -3584,26 +3589,6 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult
	pruneStaging.template operator()<ICPUBufferView>();
	pruneStaging.template operator()<ICPUImage>();
	pruneStaging.template operator()<ICPUTopLevelAccelerationStructure>();
-	// go over future TLAS builds to gather used BLASes
-	for (auto i=0; i<2; i++)
-	for (const auto& req : retval.m_tlasConversions[i])
-	{
-		auto* const cpuTLAS = req.second.canonical.get();
-		assert(cpuTLAS);
-		for (const auto& instance : cpuTLAS->getInstances())
-		{
-			auto* const cpuBLAS = instance.getBase().blas.get();
-			auto foundBLAS = retval.m_blasBuildMap.find(cpuBLAS);
-			if (foundBLAS!=retval.m_blasBuildMap.end())
-				foundBLAS->second.remainingUsages++;
-			else
-			{
-				smart_refctd_ptr<const IGPUBottomLevelAccelerationStructure> gpuBLAS;
-				// TODO: figure out the BLAS that will be used (this requires UUID)
-				retval.m_blasBuildMap.insert(foundBLAS,{cpuBLAS,{std::move(gpuBLAS),1,1}});
-			}
-		}
-	}
	pruneStaging.template operator()<ICPUBottomLevelAccelerationStructure>();
	pruneStaging.template operator()<ICPUBuffer>();
}
@@ -3620,7 +3605,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult
	// index 0 is device build, 1 is host build
	size_t scratchSizeFullParallelBuild[2] = {0,0};
	//
-	const SReserveResult::SConvReqAccelerationStructureMap<AccelerationStructure>* pConversions;
+	const std::conditional_t<IsTLAS,SReserveResult::SConvReqTLASMap,SReserveResult::SConvReqBLASMap>* pConversions;
	if constexpr (IsTLAS)
		pConversions = retval.m_tlasConversions;
	else
@@ -3755,7 +3740,25 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
	}
};

-// compacted TLASes need to be substituted in cache and Descriptor Sets
+// want to check if deps successfully exist
+struct SMissingDependent
+{
+	// This only checks whether we had to convert and failed, but the dependent might be in the readCache of one or more converters, so if in doubt assume it's okay
+	inline operator bool() const {return wasInStaging && gotWiped;}
+
+	bool wasInStaging;
+	bool gotWiped;
+};
+auto missingDependent = [&reservations]<Asset AssetType>(const typename asset_traits<AssetType>::video_t* dep)->SMissingDependent
+{
+	auto& stagingCache = std::get<SReserveResult::staging_cache_t<AssetType>>(reservations.m_stagingCaches);
+	auto found = stagingCache.find(const_cast<typename asset_traits<AssetType>::video_t*>(dep));
+	SMissingDependent retval = {.wasInStaging=found!=stagingCache.end()};
+	retval.gotWiped = retval.wasInStaging && found->second.value==CHashCache::NoContentHash;
+	return retval;
+};
+
+// Descriptor Sets need their TLAS descriptors substituted if they've been compacted
core::unordered_map<const IGPUTopLevelAccelerationStructure*,smart_refctd_ptr<IGPUTopLevelAccelerationStructure>> compactedTLASMap;
// Anything to do?
auto reqQueueFlags = reservations.m_queueFlags;
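`SMissingDependent` widens the old boolean into two flags so callers can distinguish "never staged here" (possibly satisfied by a read cache, assume okay) from "staged and then wiped" (a definite conversion failure); its `operator bool` only fires for the latter. A standalone sketch of the same idiom with placeholder types:

```cpp
#include <unordered_map>

struct GpuObject {};
using Hash = unsigned;             // stand-in for CHashCache content hashes
constexpr Hash NoContentHash = 0u; // stand-in for the "wiped" sentinel

struct SMissingDependent
{
	// only "we tried to convert it and failed" counts as missing; an object
	// absent from staging may still live in another converter's read cache
	explicit operator bool() const { return wasInStaging && gotWiped; }

	bool wasInStaging;
	bool gotWiped;
};

SMissingDependent missingDependent(const std::unordered_map<const GpuObject*,Hash>& stagingCache, const GpuObject* dep)
{
	const auto found = stagingCache.find(dep);
	SMissingDependent retval = {.wasInStaging = found!=stagingCache.end()};
	retval.gotWiped = retval.wasInStaging && found->second==NoContentHash;
	return retval;
}
```

The TLAS loop later in this diff uses the split deliberately: a true `wasInStaging` with a false `gotWiped` still matters, because it flags a dependency on a same-call BLAS build that `failedBLASBarrier` may have poisoned.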
@@ -4672,6 +4675,9 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
		.dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT
	};

+	// compacted BLASes need to be substituted in cache and TLAS Build Inputs
+	using compacted_blas_map_t = core::unordered_map<const IGPUBottomLevelAccelerationStructure*,smart_refctd_ptr<IGPUBottomLevelAccelerationStructure>>;
+	compacted_blas_map_t compactedBLASMap;
	// Device BLAS builds
	if (blasCount)
	{
@@ -4749,7 +4755,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
		computeCmdBuf->cmdbuf->endDebugMarker();
		{
			// the already compacted BLASes need to be written into the TLASes using them, want to swap them out ASAP
-			// reservations.m_blasBuildMap[canonical].gpuBLAS = compacted;
+			// compactedBLASMap[as] = compacted;
		}
		computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Compact BLASes END");
		computeCmdBuf->cmdbuf->endDebugMarker();
@@ -4801,11 +4807,8 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
	using scratch_allocator_t = std::remove_reference_t<decltype(*params.scratchForDeviceASBuild)>;
	using addr_t = typename scratch_allocator_t::size_type;
	const auto& limits = physDev->getLimits();
-	core::unordered_set<smart_refctd_ptr<const IGPUBottomLevelAccelerationStructure>> dedupBLASesUsed;
-	dedupBLASesUsed.reserve(reservations.m_blasBuildMap.size());
	for (auto& tlasToBuild : tlasesToBuild)
	{
-		dedupBLASesUsed.clear();
		auto& canonical = tlasToBuild.second.canonical;
		const auto as = tlasToBuild.first;
		const auto pFoundHash = findInStaging.template operator()<ICPUTopLevelAccelerationStructure>(as);
@@ -4819,19 +4822,30 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
		}
		const auto instances = canonical->getInstances();
		const auto instanceCount = static_cast<uint32_t>(instances.size());
+		const auto& instanceMap = tlasToBuild.second.instanceMap;
		size_t instanceDataSize = 0;
		// gather total input size and check dependants exist
+		bool dependsOnBLASBuilds = false;
		for (const auto& instance : instances)
		{
-			// failed BLAS builds erase themselves from this map, so this checks whether some BLAS which was used, but had to be built, failed its build
-			const auto found = reservations.m_blasBuildMap.find(instance.getBase().blas.get());
-			if (found==reservations.m_blasBuildMap.end() || failedBLASBarrier && found->second.buildDuringConvertCall)
+			auto found = instanceMap.find(instance.getBase().blas.get());
+			assert(instanceMap.end()!=found);
+			const auto depInfo = missingDependent.template operator()<ICPUBottomLevelAccelerationStructure>(found->second.get());
+			if (depInfo)
			{
				instanceDataSize = 0;
				break;
			}
+			if (depInfo.wasInStaging)
+				dependsOnBLASBuilds = true;
			instanceDataSize += ITopLevelAccelerationStructure::getInstanceSize(instance.getType());
		}
+		// problem with building some dependent BLASes
+		if (failedBLASBarrier && dependsOnBLASBuilds)
+		{
+			markFailureInStaging("building BLASes which current TLAS build wants to instance",canonical,as,pFoundHash);
+			continue;
+		}
		// problem with finding the dependents (BLASes)
		if (instanceDataSize==0)
		{
@@ -4862,6 +4876,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
			params.scratchForDeviceASBuild->multi_deallocate(AllocCount,&offsets[0],&sizes[0],params.compute->getFutureScratchSemaphore());
		}
		// stream the instance/geometry input in
+		const size_t trackedBLASesOffset = trackedBLASes.size();
		{
			bool success = true;
			{
@@ -4881,27 +4896,30 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
				const auto newWritten = bytesWritten+size;
				if (newWritten>=blockSize)
					return bytesWritten;
-				auto found = blasBuildMap->find(instance.getBase().blas.get());
-				assert(found!=blasBuildMap->end());
-				const auto& blas = found->second.gpuBLAS;
-				dst = IGPUTopLevelAccelerationStructure::writeInstance(dst,instance,blas.get()->getReferenceForDeviceOperations());
-				dedupBLASesUsed->emplace(blas);
-				if (--found->second.remainingUsages == 0)
-					blasBuildMap->erase(found);
+				auto found = instanceMap->find(instance.getBase().blas.get());
+				auto blas = found->second.get();
+				if (auto found=compactedBLASMap->find(blas); found!=compactedBLASMap->end())
+					blas = found->second.get();
+				trackedBLASes->emplace_back(blas);
+				dst = IGPUTopLevelAccelerationStructure::writeInstance(dst,instance,blas->getReferenceForDeviceOperations());
				bytesWritten = newWritten;
			}
		}

-		SReserveResult::cpu_to_gpu_blas_map_t* blasBuildMap;
-		core::unordered_set<smart_refctd_ptr<const IGPUBottomLevelAccelerationStructure>>* dedupBLASesUsed;
+		const compacted_blas_map_t* compactedBLASMap;
+		core::vector<smart_refctd_ptr<const IGPUBottomLevelAccelerationStructure>>* trackedBLASes;
+		SReserveResult::SConvReqTLAS::cpu_to_gpu_blas_map_t* instanceMap;
		std::span<const ICPUTopLevelAccelerationStructure::PolymorphicInstance> instances;
		uint32_t instanceIndex = 0;
	};
	FillInstances fillInstances;
-	fillInstances.blasBuildMap = &reservations.m_blasBuildMap;
-	fillInstances.dedupBLASesUsed = &dedupBLASesUsed;
+	fillInstances.compactedBLASMap = &compactedBLASMap;
+	fillInstances.trackedBLASes = &trackedBLASes;
+	fillInstances.instanceMap = &tlasToBuild.second.instanceMap;
	fillInstances.instances = instances;
	success = streamDataToScratch(offsets[1],sizes[1],fillInstances);
+	// provoke refcounting bugs right away
+	tlasToBuild.second.instanceMap.clear();
}
if (success && as->usesMotion())
{
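Inside `FillInstances`, each instance write now does two things: swap in the compacted BLAS if one exists in `compactedBLASMap`, then append whichever BLAS was actually referenced to `trackedBLASes` so it stays alive for the build. A sketch of just that substitute-then-track step; `std::shared_ptr` stands in for the intrusive `smart_refctd_ptr` (which is why the real code can track from a raw pointer, while this sketch passes the owning handle through):

```cpp
#include <memory>
#include <unordered_map>
#include <vector>

struct GpuBLAS {};
using compacted_blas_map_t = std::unordered_map<const GpuBLAS*,std::shared_ptr<GpuBLAS>>;

std::shared_ptr<const GpuBLAS> substituteAndTrack(
	std::shared_ptr<const GpuBLAS> blas,
	const compacted_blas_map_t& compactedBLASMap,
	std::vector<std::shared_ptr<const GpuBLAS>>& trackedBLASes)
{
	// prefer the compacted replacement if this BLAS has already been compacted
	if (const auto found = compactedBLASMap.find(blas.get()); found!=compactedBLASMap.end())
		blas = found->second;
	// keep whatever the instance actually references alive until the build retires
	trackedBLASes.emplace_back(blas);
	return blas; // the caller writes this BLAS's device reference into the instance
}
```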
@@ -4935,6 +4953,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
			xferCmdBuf = params.transfer->getCommandBufferForRecording();
		if (!success)
		{
+			trackedBLASes.resize(trackedBLASesOffset);
			markFailureInStaging("Uploading Instance Data for TLAS build failed",canonical,as,pFoundHash);
			continue;
		}
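Since `trackedBLASes` is shared by every TLAS build in the batch, a failed upload must discard only its own appends; recording `trackedBLASesOffset` up front and truncating back to it is the rollback. The same watermark pattern in isolation:

```cpp
#include <string>
#include <vector>

// append this build's entries to a shared vector; on failure, truncate back
// to the watermark so other builds' entries are untouched
bool appendOrRollback(std::vector<std::string>& shared, bool simulateFailure)
{
	const auto watermark = shared.size(); // plays the role of trackedBLASesOffset
	shared.emplace_back("blas A");
	shared.emplace_back("blas B");
	if (simulateFailure)
	{
		shared.resize(watermark); // drop only the entries appended above
		return false;
	}
	return true;
}
```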
@@ -4950,14 +4969,8 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
		// note we don't build directly from staging, because only very small inputs could come from there and they'd impede the transfer efficiency of the larger ones
		buildInfo.instanceData = {.offset=offsets[as->usesMotion() ? 2:1],.buffer=smart_refctd_ptr<IGPUBuffer>(scratchBuffer)};
		// store a base-relative offset for now, because the vector can still grow (and reallocate)
-		{
-			const auto offset = trackedBLASes.size();
-			using p_p_BLAS_t = const IGPUBottomLevelAccelerationStructure**;
-			buildInfo.trackedBLASes = {reinterpret_cast<const p_p_BLAS_t&>(offset),dedupBLASesUsed.size()};
-			for (auto& blas : dedupBLASesUsed)
-				trackedBLASes.emplace_back(std::move(blas));
-
-		}
+		using p_p_BLAS_t = const IGPUBottomLevelAccelerationStructure**;
+		buildInfo.trackedBLASes = {reinterpret_cast<const p_p_BLAS_t&>(trackedBLASesOffset),trackedBLASes.size()-trackedBLASesOffset};
		// no special extra byte offset into the instance buffer
		rangeInfos.emplace_back(instanceCount,0u);
		//
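The `reinterpret_cast<const p_p_BLAS_t&>(trackedBLASesOffset)` stores a vector *offset* in a pointer-typed span field, because `trackedBLASes` can still reallocate while later builds are batched; presumably the offsets are resolved against `trackedBLASes.data()` once the batch is sealed. A sketch of that record-offset-now, resolve-later scheme under that assumption, without the type punning:

```cpp
#include <cstddef>
#include <span>
#include <vector>

struct BLAS {};

struct BuildInfo
{
	// while the batch is still growing we can only record {offset,count};
	// data() is not stable until the vector stops reallocating
	std::size_t trackedOffset = 0;
	std::size_t trackedCount = 0;
	std::span<const BLAS* const> trackedBLASes;
};

// once the shared vector is final, offsets become real spans
void resolveSpans(std::vector<BuildInfo>& builds, const std::vector<const BLAS*>& storage)
{
	for (auto& b : builds)
		b.trackedBLASes = {storage.data()+b.trackedOffset, b.trackedCount};
}
```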
@@ -4984,7 +4997,6 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
			else
				compactedOwnershipReleaseIndices.push_back(~0u);
		}
-		reservations.m_blasBuildMap.clear();
		// finish the last batch
		recordBuildCommands();
		if (!flushRanges.empty())
@@ -5154,18 +5166,6 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
	// finish host tasks if not done yet
	hostUploadBuffers([]()->bool{return true;});

-	// Descriptor Sets need their TLAS descriptors substituted if they've been compacted
-	// want to check if deps successfully exist
-	auto missingDependent = [&reservations]<Asset AssetType>(const typename asset_traits<AssetType>::video_t* dep)->bool
-	{
-		auto& stagingCache = std::get<SReserveResult::staging_cache_t<AssetType>>(reservations.m_stagingCaches);
-		auto found = stagingCache.find(const_cast<typename asset_traits<AssetType>::video_t*>(dep));
-		// this only checks whether we had to convert and failed
-		if (found!=stagingCache.end() && found->second.value==CHashCache::NoContentHash)
-			return true;
-		// but the dependent might be in the readCache of one or more converters, so if in doubt assume it's okay
-		return false;
-	};
	// insert items into cache if overflows handled fine and commandbuffers ready to be recorded
	auto mergeCache = [&]<Asset AssetType>()->void
	{
@@ -5277,7 +5277,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
	mergeCache.template operator()<ICPUComputePipeline>();
	mergeCache.template operator()<ICPURenderpass>();
	mergeCache.template operator()<ICPUGraphicsPipeline>();
-	// write the TLASes into Descriptor Set finally
+	// overwrite the compacted TLASes in Descriptor Sets
	if (auto& tlasRewriteSet=reservations.m_potentialTLASRewrites; !tlasRewriteSet.empty())
	{
		core::vector<IGPUDescriptorSet::SWriteDescriptorSet> writes;