Skip to content

Commit 69df18a

Browse files
author
devsh
committed
save progress before attempting to remove m_deferredTLASDescriptorWrites
1 parent 8555fad commit 69df18a

File tree

2 files changed

+81
-46
lines changed

2 files changed

+81
-46
lines changed

include/nbl/video/utilities/CAssetConverter.h

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1103,11 +1103,13 @@ class CAssetConverter : public core::IReferenceCounted
11031103
uint64_t scratchSize : 45;
11041104
uint64_t compact : 1;
11051105
uint64_t buildFlags : 16 = 0;
1106+
// scratch + input size, also accounting for worst-case padding due to alignment
1107+
uint64_t buildSize;
11061108
};
1107-
using SConvReqBLAS = SConvReqAccelerationStructure<asset::ICPUBottomLevelAccelerationStructure>;
1108-
core::unordered_map<IGPUBottomLevelAccelerationStructure*,SConvReqBLAS> m_blasConversions[2];
1109-
using SConvReqTLAS = SConvReqAccelerationStructure<asset::ICPUTopLevelAccelerationStructure>;
1110-
core::unordered_map<IGPUTopLevelAccelerationStructure*,SConvReqTLAS> m_tlasConversions[2];
1109+
template<typename CPUAccelerationStructure>
1110+
using SConvReqAccelerationStructureMap = core::unordered_map<typename asset_traits<CPUAccelerationStructure>::video_t*,SConvReqAccelerationStructure<CPUAccelerationStructure>>;
1111+
SConvReqAccelerationStructureMap<asset::ICPUBottomLevelAccelerationStructure> m_blasConversions[2];
1112+
SConvReqAccelerationStructureMap<asset::ICPUTopLevelAccelerationStructure> m_tlasConversions[2];
11111113

11121114
// array index 0 for device builds, 1 for host builds
11131115
uint64_t m_minASBuildScratchSize[2] = {0,0};

src/nbl/video/utilities/CAssetConverter.cpp

Lines changed: 75 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1654,10 +1654,6 @@ class GetDependantVisit;
16541654
template<>
16551655
class GetDependantVisit<ICPUTopLevelAccelerationStructure> : public GetDependantVisitBase<ICPUTopLevelAccelerationStructure>
16561656
{
1657-
public:
1658-
// TODO: deal with usages not going through because of cancelled TLAS builds, by gathering in a top-down pass at the end of `reserve`
1659-
CAssetConverter::SReserveResult::cpu_to_gpu_blas_map_t* blasBuildMap = nullptr;
1660-
16611657
protected:
16621658
bool descend_impl(
16631659
const instance_t<AssetType>& user, const CAssetConverter::patch_t<AssetType>& userPatch,
@@ -1668,16 +1664,6 @@ class GetDependantVisit<ICPUTopLevelAccelerationStructure> : public GetDependant
16681664
auto depObj = getDependant<ICPUBottomLevelAccelerationStructure>(dep,soloPatch);
16691665
if (!depObj)
16701666
return false;
1671-
if (blasBuildMap)
1672-
{
1673-
const auto instances = user.asset->getInstances();
1674-
assert(instanceIndex<instances.size());
1675-
auto foundBLAS = blasBuildMap->find(dep.asset);
1676-
if (foundBLAS!=blasBuildMap->end())
1677-
foundBLAS->second.remainingUsages++;
1678-
else
1679-
blasBuildMap->insert(foundBLAS,{dep.asset,{depObj}});
1680-
}
16811667
return true;
16821668
}
16831669
};
@@ -1958,9 +1944,13 @@ class GetDependantVisit<ICPUDescriptorSet> : public GetDependantVisitBase<ICPUDe
19581944
// the RLE will always finish a write because a single binding can only be a single descriptor type; it is important that the TLAS path happens after that check
19591945
if constexpr (std::is_same_v<DepType,ICPUTopLevelAccelerationStructure>)
19601946
{
1961-
const auto [where,inserted] =deferredTLASWrites.insert({binding.data,element,depObj});
1962-
assert(inserted);
1963-
return true;
1947+
// not built yet?
1948+
if (depObj->)
1949+
{
1950+
const auto [where,inserted] = deferredTLASWrites.insert({binding.data,element,depObj});
1951+
assert(inserted);
1952+
return true;
1953+
}
19641954
}
19651955
//
19661956
auto& outInfo = infos.emplace_back();
@@ -3420,19 +3410,16 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult
34203410
bufferConversions.propagateToCaches(std::get<dfs_cache<ICPUBuffer>>(dfsCaches),std::get<SReserveResult::staging_cache_t<ICPUBuffer>>(retval.m_stagingCaches));
34213411
// Deal with Deferred Creation of Acceleration structures
34223412
{
3423-
const auto minScratchAlignment = device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment;
34243413
auto createAccelerationStructures = [&]<typename AccelerationStructure>()->void
34253414
{
34263415
constexpr bool IsTLAS = std::is_same_v<AccelerationStructure,ICPUTopLevelAccelerationStructure>;
3427-
// TLAS and BLAS can't build concurrently, index 0 is device build, 1 is host build
3428-
size_t scratchSizeFullParallelBuild[2] = {0,0};
34293416
//
3430-
core::unordered_map<typename asset_traits<AccelerationStructure>::video_t*,SReserveResult::SConvReqAccelerationStructure<AccelerationStructure>>* pConversions;
3417+
SReserveResult::SConvReqAccelerationStructureMap<AccelerationStructure>* pConversions;
34313418
if constexpr (IsTLAS)
34323419
pConversions = retval.m_tlasConversions;
34333420
else
34343421
pConversions = retval.m_blasConversions;
3435-
// we collect that stats AFTER making sure that the BLAS / TLAS can actually be created
3422+
// we enqueue the conversions AFTER making sure that the BLAS / TLAS can actually be created
34363423
for (size_t i=0; i<accelerationStructureParams[IsTLAS].size(); i++)
34373424
if (const auto& deferredParams=accelerationStructureParams[IsTLAS][i]; deferredParams.storage)
34383425
{
@@ -3454,7 +3441,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult
34543441
{
34553442
// check if the BLASes we want to use for the instances were successfully allocated and created
34563443
AssetVisitor<GetDependantVisit<ICPUTopLevelAccelerationStructure>> visitor = {
3457-
{inputs,dfsCaches,&retval.m_blasBuildMap},
3444+
{inputs,dfsCaches},
34583445
{canonical,deferredParams.uniqueCopyGroupID},
34593446
patch
34603447
};
@@ -3483,23 +3470,13 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult
34833470
request.scratchSize = deferredParams.scratchSize;
34843471
request.compact = patch.compactAfterBuild;
34853472
request.buildFlags = static_cast<uint16_t>(patch.getBuildFlags(canonical).value);
3486-
// sizes for building 1-by-1 vs parallel, note that BLAS and TLAS can't be built concurrently
3487-
retval.m_minASBuildScratchSize[patch.hostBuild] = core::max(retval.m_minASBuildScratchSize[patch.hostBuild],deferredParams.buildSize);
3488-
scratchSizeFullParallelBuild[patch.hostBuild] += deferredParams.buildSize;
3489-
// note that in order to compact an AS you need to allocate a buffer range whose size is known only after the build
3490-
if (patch.compactAfterBuild)
3491-
retval.m_compactedASMaxMemory += bufSz;
3473+
request.buildSize = deferredParams.buildSize;
34923474
}
3493-
retval.m_maxASBuildScratchSize[0] = core::max(scratchSizeFullParallelBuild[0],retval.m_maxASBuildScratchSize[0]);
3494-
retval.m_maxASBuildScratchSize[1] = core::max(scratchSizeFullParallelBuild[1],retval.m_maxASBuildScratchSize[1]);
34953475
};
34963476
createAccelerationStructures.template operator()<ICPUBottomLevelAccelerationStructure>();
34973477
blasConversions.propagateToCaches(std::get<dfs_cache<ICPUBottomLevelAccelerationStructure>>(dfsCaches),std::get<SReserveResult::staging_cache_t<ICPUBottomLevelAccelerationStructure>>(retval.m_stagingCaches));
34983478
createAccelerationStructures.template operator()<ICPUTopLevelAccelerationStructure>();
34993479
tlasConversions.propagateToCaches(std::get<dfs_cache<ICPUTopLevelAccelerationStructure>>(dfsCaches),std::get<SReserveResult::staging_cache_t<ICPUTopLevelAccelerationStructure>>(retval.m_stagingCaches));
3500-
//
3501-
if (retval.willDeviceASBuild() || retval.willCompactAS())
3502-
retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT;
35033480
}
35043481
// find out which images need what caps for the transfer and mipmapping
35053482
auto& dfsCacheImages = std::get<dfs_cache<ICPUImage>>(dfsCaches);
@@ -3580,18 +3557,19 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult
35803557
if constexpr (std::is_same_v<AssetType,ICPUBuffer>)
35813558
retval.m_bufferConversions.erase(entry.first);
35823559
if constexpr (std::is_same_v<AssetType,ICPUBottomLevelAccelerationStructure>)
3583-
{
3584-
}
3560+
for (auto i=0; i<2; i++)
3561+
retval.m_blasConversions[i].erase(entry.first);
35853562
if constexpr (std::is_same_v<AssetType,ICPUTopLevelAccelerationStructure>)
3586-
{
3587-
}
3563+
for (auto i=0; i<2; i++)
3564+
retval.m_tlasConversions[i].erase(entry.first);
35883565
if constexpr (std::is_same_v<AssetType,ICPUImage>)
35893566
retval.m_imageConversions.erase(entry.first);
35903567
// because Descriptor Sets don't hold onto TLASes yet, we need to drop the TLASes in deferred descriptor writes
35913568
if constexpr (std::is_same_v<AssetType,ICPUDescriptorSet>)
35923569
retval.m_deferredTLASDescriptorWrites.erase(entry.first);
35933570
return true;
35943571
}
3572+
// still referenced, keep it around
35953573
return false;
35963574
}
35973575
);
@@ -3611,16 +3589,71 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult
36113589
pruneStaging.template operator()<ICPUBufferView>();
36123590
pruneStaging.template operator()<ICPUImage>();
36133591
pruneStaging.template operator()<ICPUTopLevelAccelerationStructure>();
3614-
// go over
3592+
// go over future TLAS builds to gather used BLASes
3593+
for (auto i=0; i<2; i++)
3594+
for (const auto& req : retval.m_tlasConversions[i])
3595+
{
3596+
auto* const cpuTLAS = req.second.canonical.get();
3597+
assert(cpuTLAS);
3598+
for (const auto& instance : cpuTLAS->getInstances())
3599+
{
3600+
auto* const cpuBLAS = instance.getBase().blas.get();
3601+
auto foundBLAS = retval.m_blasBuildMap.find(cpuBLAS);
3602+
if (foundBLAS!=retval.m_blasBuildMap.end())
3603+
foundBLAS->second.remainingUsages++;
3604+
else
3605+
{
3606+
smart_refctd_ptr<const IGPUBottomLevelAccelerationStructure> gpuBLAS;
3607+
// TODO
3608+
retval.m_blasBuildMap.insert(foundBLAS,{cpuBLAS,{std::move(gpuBLAS),1,1}});
3609+
}
3610+
}
3611+
}
36153612
pruneStaging.template operator()<ICPUBottomLevelAccelerationStructure>();
36163613
pruneStaging.template operator()<ICPUBuffer>();
36173614
}
36183615

3619-
// TODO: prune the conversion requests -> maybe change the conversion requests to unordered_map ?
3620-
36213616
// only now get the queue flags
36223617
{
36233618
using q_fam_f = IQueue::FAMILY_FLAGS;
3619+
// acceleration structures, get scratch size
3620+
auto computeAccelerationStructureScratchSizes = [device,&retval]<typename AccelerationStructure>()->void
3621+
{
3622+
constexpr bool IsTLAS = std::is_same_v<AccelerationStructure,ICPUTopLevelAccelerationStructure>;
3623+
const auto& limits = device->getPhysicalDevice()->getLimits();
3624+
const auto minScratchAlignment = limits.minAccelerationStructureScratchOffsetAlignment;
3625+
// index 0 is device build, 1 is host build
3626+
size_t scratchSizeFullParallelBuild[2] = {0,0};
3627+
//
3628+
const SReserveResult::SConvReqAccelerationStructureMap<AccelerationStructure>* pConversions;
3629+
if constexpr (IsTLAS)
3630+
pConversions = retval.m_tlasConversions;
3631+
else
3632+
pConversions = retval.m_blasConversions;
3633+
// we collect the stats AFTER making sure only the needed TLASes and BLASes will be built
3634+
for (auto i=0; i<2; i++)
3635+
for (auto req : pConversions[i])
3636+
{
3637+
const auto buildSize = req.second.buildSize;
3638+
// sizes for building 1-by-1 vs parallel, note that BLAS and TLAS can't be built concurrently
3639+
retval.m_minASBuildScratchSize[i] = core::max(retval.m_minASBuildScratchSize[i],buildSize);
3640+
scratchSizeFullParallelBuild[i] = core::alignUp(scratchSizeFullParallelBuild[i],minScratchAlignment)+buildSize;
3641+
// note that in order to compact an AS you need to allocate a buffer range whose size is known only after the build
3642+
if (req.second.compact)
3643+
{
3644+
const auto asSize = req.first->getCreationParams().bufferRange.size;
3645+
assert(core::is_aligned_to(asSize,256));
3646+
retval.m_compactedASMaxMemory += asSize;
3647+
}
3648+
}
3649+
// TLAS and BLAS can't be built concurrently
3650+
retval.m_maxASBuildScratchSize[0] = core::max(scratchSizeFullParallelBuild[0],retval.m_maxASBuildScratchSize[0]);
3651+
retval.m_maxASBuildScratchSize[1] = core::max(scratchSizeFullParallelBuild[1],retval.m_maxASBuildScratchSize[1]);
3652+
};
3653+
computeAccelerationStructureScratchSizes.template operator()<ICPUBottomLevelAccelerationStructure>();
3654+
computeAccelerationStructureScratchSizes.template operator()<ICPUTopLevelAccelerationStructure>();
3655+
if (retval.willDeviceASBuild() || retval.willCompactAS())
3656+
retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT;
36243657
// images are trickier, we can't finish iterating until all possible flags are there
36253658
for (auto it=retval.m_imageConversions.begin(); !retval.m_queueFlags.hasFlags(q_fam_f::TRANSFER_BIT|q_fam_f::COMPUTE_BIT|q_fam_f::GRAPHICS_BIT) && it!=retval.m_imageConversions.end(); it++)
36263659
{
@@ -3632,7 +3665,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult
36323665
// Best effort guess, without actually looking at all regions
36333666
const auto& params = it->first->getCreationParameters();
36343667
// https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/vkCmdCopyBufferToImage.html#VUID-vkCmdCopyBufferToImage-commandBuffer-07739
3635-
if (isDepthOrStencilFormat(params.format) && (params.depthUsage | params.stencilUsage).hasFlags(IGPUImage::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT))
3668+
if (isDepthOrStencilFormat(params.format) && (params.depthUsage|params.stencilUsage).hasFlags(IGPUImage::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT))
36363669
retval.m_queueFlags |= IQueue::FAMILY_FLAGS::GRAPHICS_BIT;
36373670
if (it->second.recomputeMips)
36383671
retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT;

0 commit comments

Comments
 (0)