@@ -1654,10 +1654,6 @@ class GetDependantVisit;
 template<>
 class GetDependantVisit<ICPUTopLevelAccelerationStructure> : public GetDependantVisitBase<ICPUTopLevelAccelerationStructure>
 {
-	public:
-		// TODO: deal with usages not going through because of cancelled TLAS builds, by gathering in a top-down pass at the end of `reserve`
-		CAssetConverter::SReserveResult::cpu_to_gpu_blas_map_t* blasBuildMap = nullptr;
-
 	protected:
 		bool descend_impl(
 			const instance_t<AssetType>& user, const CAssetConverter::patch_t<AssetType>& userPatch,
@@ -1668,16 +1664,6 @@ class GetDependantVisit<ICPUTopLevelAccelerationStructure> : public GetDependant
 			auto depObj = getDependant<ICPUBottomLevelAccelerationStructure>(dep,soloPatch);
 			if (!depObj)
 				return false;
-			if (blasBuildMap)
-			{
-				const auto instances = user.asset->getInstances();
-				assert(instanceIndex<instances.size());
-				auto foundBLAS = blasBuildMap->find(dep.asset);
-				if (foundBLAS!=blasBuildMap->end())
-					foundBLAS->second.remainingUsages++;
-				else
-					blasBuildMap->insert(foundBLAS,{dep.asset,{depObj}});
-			}
 			return true;
 		}
 };
@@ -1958,9 +1944,13 @@ class GetDependantVisit<ICPUDescriptorSet> : public GetDependantVisitBase<ICPUDe
 			// the RLE will always finish a write because a single binding can only be a single descriptor type, important that the TLAS path happens after that check
 			if constexpr (std::is_same_v<DepType,ICPUTopLevelAccelerationStructure>)
 			{
-				const auto [where,inserted] = deferredTLASWrites.insert({binding.data,element,depObj});
-				assert(inserted);
-				return true;
+				// not built yet?
+				if (depObj->)
+				{
+					const auto [where,inserted] = deferredTLASWrites.insert({binding.data,element,depObj});
+					assert(inserted);
+					return true;
+				}
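+				// otherwise (presumably when the GPU TLAS has already been built) fall through and record a regular descriptor write below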
 			}
 			//
 			auto& outInfo = infos.emplace_back();
@@ -3420,19 +3410,16 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult
 	bufferConversions.propagateToCaches(std::get<dfs_cache<ICPUBuffer>>(dfsCaches),std::get<SReserveResult::staging_cache_t<ICPUBuffer>>(retval.m_stagingCaches));
 	// Deal with Deferred Creation of Acceleration structures
 	{
-		const auto minScratchAlignment = device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment;
 		auto createAccelerationStructures = [&]<typename AccelerationStructure>()->void
 		{
 			constexpr bool IsTLAS = std::is_same_v<AccelerationStructure,ICPUTopLevelAccelerationStructure>;
-			// TLAS and BLAS can't build concurrently, index 0 is device build, 1 is host build
-			size_t scratchSizeFullParallelBuild[2] = {0,0};
 			//
-			core::unordered_map<typename asset_traits<AccelerationStructure>::video_t*,SReserveResult::SConvReqAccelerationStructure<AccelerationStructure>>* pConversions;
+			SReserveResult::SConvReqAccelerationStructureMap<AccelerationStructure>* pConversions;
 			if constexpr (IsTLAS)
 				pConversions = retval.m_tlasConversions;
 			else
 				pConversions = retval.m_blasConversions;
-			// we collect that stats AFTER making sure that the BLAS / TLAS can actually be created
+			// we enqueue the conversions AFTER making sure that the BLAS / TLAS can actually be created
 			for (size_t i=0; i<accelerationStructureParams[IsTLAS].size(); i++)
 			if (const auto& deferredParams=accelerationStructureParams[IsTLAS][i]; deferredParams.storage)
 			{
@@ -3454,7 +3441,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult
 				{
 					// check if the BLASes we want to use for the instances were successfully allocated and created
 					AssetVisitor<GetDependantVisit<ICPUTopLevelAccelerationStructure>> visitor = {
-						{inputs,dfsCaches,&retval.m_blasBuildMap},
+						{inputs,dfsCaches},
 						{canonical,deferredParams.uniqueCopyGroupID},
 						patch
 					};
@@ -3483,23 +3470,13 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult
 				request.scratchSize = deferredParams.scratchSize;
 				request.compact = patch.compactAfterBuild;
 				request.buildFlags = static_cast<uint16_t>(patch.getBuildFlags(canonical).value);
-				// sizes for building 1-by-1 vs parallel, note that BLAS and TLAS can't be built concurrently
-				retval.m_minASBuildScratchSize[patch.hostBuild] = core::max(retval.m_minASBuildScratchSize[patch.hostBuild],deferredParams.buildSize);
-				scratchSizeFullParallelBuild[patch.hostBuild] += deferredParams.buildSize;
-				// note that in order to compact an AS you need to allocate a buffer range whose size is known only after the build
-				if (patch.compactAfterBuild)
-					retval.m_compactedASMaxMemory += bufSz;
+				request.buildSize = deferredParams.buildSize;
 			}
-			retval.m_maxASBuildScratchSize[0] = core::max(scratchSizeFullParallelBuild[0],retval.m_maxASBuildScratchSize[0]);
-			retval.m_maxASBuildScratchSize[1] = core::max(scratchSizeFullParallelBuild[1],retval.m_maxASBuildScratchSize[1]);
 		};
 		createAccelerationStructures.template operator()<ICPUBottomLevelAccelerationStructure>();
 		blasConversions.propagateToCaches(std::get<dfs_cache<ICPUBottomLevelAccelerationStructure>>(dfsCaches),std::get<SReserveResult::staging_cache_t<ICPUBottomLevelAccelerationStructure>>(retval.m_stagingCaches));
 		createAccelerationStructures.template operator()<ICPUTopLevelAccelerationStructure>();
 		tlasConversions.propagateToCaches(std::get<dfs_cache<ICPUTopLevelAccelerationStructure>>(dfsCaches),std::get<SReserveResult::staging_cache_t<ICPUTopLevelAccelerationStructure>>(retval.m_stagingCaches));
-		//
-		if (retval.willDeviceASBuild() || retval.willCompactAS())
-			retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT;
 	}
 	// find out which images need what caps for the transfer and mipmapping
 	auto& dfsCacheImages = std::get<dfs_cache<ICPUImage>>(dfsCaches);
@@ -3580,18 +3557,19 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult
 					if constexpr (std::is_same_v<AssetType,ICPUBuffer>)
 						retval.m_bufferConversions.erase(entry.first);
 					if constexpr (std::is_same_v<AssetType,ICPUBottomLevelAccelerationStructure>)
-					{
-					}
+					for (auto i=0; i<2; i++)
+						retval.m_blasConversions[i].erase(entry.first);
 					if constexpr (std::is_same_v<AssetType,ICPUTopLevelAccelerationStructure>)
-					{
-					}
+					for (auto i=0; i<2; i++)
+						retval.m_tlasConversions[i].erase(entry.first);
 					if constexpr (std::is_same_v<AssetType,ICPUImage>)
 						retval.m_imageConversions.erase(entry.first);
 					// because Descriptor Sets don't hold onto TLASes yet, we need to drop the TLASes in deferred descriptor writes
 					if constexpr (std::is_same_v<AssetType,ICPUDescriptorSet>)
 						retval.m_deferredTLASDescriptorWrites.erase(entry.first);
 					return true;
 				}
+				// still referenced, keep it around
 				return false;
 			}
 		);
@@ -3611,16 +3589,71 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult
 		pruneStaging.template operator()<ICPUBufferView>();
 		pruneStaging.template operator()<ICPUImage>();
 		pruneStaging.template operator()<ICPUTopLevelAccelerationStructure>();
-		// go over
+		// go over future TLAS builds to gather used BLASes
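+		// `remainingUsages` presumably counts how many surviving TLAS build requests still reference each BLAS, so later stages know when a BLAS is no longer needed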
+		for (auto i=0; i<2; i++)
+		for (const auto& req : retval.m_tlasConversions[i])
+		{
+			auto* const cpuTLAS = req.second.canonical.get();
+			assert(cpuTLAS);
+			for (const auto& instance : cpuTLAS->getInstances())
+			{
+				auto* const cpuBLAS = instance.getBase().blas.get();
+				auto foundBLAS = retval.m_blasBuildMap.find(cpuBLAS);
+				if (foundBLAS!=retval.m_blasBuildMap.end())
+					foundBLAS->second.remainingUsages++;
+				else
+				{
+					smart_refctd_ptr<const IGPUBottomLevelAccelerationStructure> gpuBLAS;
+					// TODO
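+					// first usage of this BLAS, hence its remainingUsages starts at 1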
+					retval.m_blasBuildMap.insert(foundBLAS,{cpuBLAS,{std::move(gpuBLAS),1,1}});
+				}
+			}
+		}
 		pruneStaging.template operator()<ICPUBottomLevelAccelerationStructure>();
 		pruneStaging.template operator()<ICPUBuffer>();
 	}
-	// TODO: prune the conversion requests -> maybe change the conversion requests to unordered_map?
-
 	// only now get the queue flags
 	{
 		using q_fam_f = IQueue::FAMILY_FLAGS;
+		// acceleration structures, get scratch size
+		auto computeAccelerationStructureScratchSizes = [device,&retval]<typename AccelerationStructure>()->void
+		{
+			constexpr bool IsTLAS = std::is_same_v<AccelerationStructure,ICPUTopLevelAccelerationStructure>;
+			const auto& limits = device->getPhysicalDevice()->getLimits();
+			const auto minScratchAlignment = limits.minAccelerationStructureScratchOffsetAlignment;
+			// index 0 is device build, 1 is host build
+			size_t scratchSizeFullParallelBuild[2] = {0,0};
+			//
+			const SReserveResult::SConvReqAccelerationStructureMap<AccelerationStructure>* pConversions;
+			if constexpr (IsTLAS)
+				pConversions = retval.m_tlasConversions;
+			else
+				pConversions = retval.m_blasConversions;
+			// we collect the stats AFTER making sure only needed TLAS and BLAS will be built
+			for (auto i=0; i<2; i++)
+			for (auto req : pConversions[i])
+			{
+				const auto buildSize = req.second.buildSize;
+				// sizes for building 1-by-1 vs parallel, note that BLAS and TLAS can't be built concurrently
+				retval.m_minASBuildScratchSize[i] = core::max(retval.m_minASBuildScratchSize[i],buildSize);
+				scratchSizeFullParallelBuild[i] = core::alignUp(scratchSizeFullParallelBuild[i],minScratchAlignment)+buildSize;
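+				// e.g. with minScratchAlignment=128, builds of size 300 then 200 need alignUp(300,128)+200 = 384+200 = 584 bytes of scratch for a fully parallel build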
+				// note that in order to compact an AS you need to allocate a buffer range whose size is known only after the build
+				if (req.second.compact)
+				{
+					const auto asSize = req.first->getCreationParams().bufferRange.size;
+					assert(core::is_aligned_to(asSize,256));
+					retval.m_compactedASMaxMemory += asSize;
+				}
+			}
+			// TLAS and BLAS can't build concurrently
+			retval.m_maxASBuildScratchSize[0] = core::max(scratchSizeFullParallelBuild[0],retval.m_maxASBuildScratchSize[0]);
+			retval.m_maxASBuildScratchSize[1] = core::max(scratchSizeFullParallelBuild[1],retval.m_maxASBuildScratchSize[1]);
+		};
+		computeAccelerationStructureScratchSizes.template operator()<ICPUBottomLevelAccelerationStructure>();
+		computeAccelerationStructureScratchSizes.template operator()<ICPUTopLevelAccelerationStructure>();
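+		// device AS builds and compaction copies must be recorded on a compute-capable queue (Vulkan requires compute capability for acceleration structure build commands)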
+		if (retval.willDeviceASBuild() || retval.willCompactAS())
+			retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT;
 		// images are trickier, we can't finish iterating until all possible flags are there
 		for (auto it=retval.m_imageConversions.begin(); !retval.m_queueFlags.hasFlags(q_fam_f::TRANSFER_BIT|q_fam_f::COMPUTE_BIT|q_fam_f::GRAPHICS_BIT) && it!=retval.m_imageConversions.end(); it++)
 		{
@@ -3632,7 +3665,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult
 			// Best effort guess, without actually looking at all regions
 			const auto& params = it->first->getCreationParameters();
 			// https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/vkCmdCopyBufferToImage.html#VUID-vkCmdCopyBufferToImage-commandBuffer-07739
-			if (isDepthOrStencilFormat(params.format) && (params.depthUsage|params.stencilUsage).hasFlags(IGPUImage::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT))
+			if (isDepthOrStencilFormat(params.format) && (params.depthUsage|params.stencilUsage).hasFlags(IGPUImage::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT))
 				retval.m_queueFlags |= IQueue::FAMILY_FLAGS::GRAPHICS_BIT;
 			if (it->second.recomputeMips)
 				retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT;