Skip to content

Commit 26ba094

Browse files
author
devsh
committed
release canonical assets ASAP, and properly fill the BLAS tracking data for the TLAS build into
1 parent 3ca6219 commit 26ba094

File tree

2 files changed

+56
-26
lines changed

2 files changed

+56
-26
lines changed

include/nbl/video/utilities/CAssetConverter.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1094,7 +1094,14 @@ class CAssetConverter : public core::IReferenceCounted
10941094
// We do all compactions on the Device for simplicity
10951095
uint8_t m_willCompactSomeAS : 1 = false;
10961096
// This tracks non-root BLASes which are needed for a subsequent TLAS build. Note that even things which are NOT in the staging cache are tracked here to make sure they don't finish their lifetimes early.
1097-
using cpu_to_gpu_blas_map_t = core::unordered_map<const asset::ICPUBottomLevelAccelerationStructure*,asset_cached_t<asset::ICPUBottomLevelAccelerationStructure>>;
1097+
struct BLASUsedInTLASBuild
1098+
{
1099+
// This is the BLAS meant to be used for the instance, note that compaction of a BLAS overwrites the initial values at the end of `reserve`
1100+
core::smart_refctd_ptr<const IGPUBottomLevelAccelerationStructure> gpuBLAS;
1101+
// internal micro-refcount which lets us know when we should remove the entry from the map below
1102+
uint32_t remainingUsages = 0;
1103+
};
1104+
using cpu_to_gpu_blas_map_t = core::unordered_map<const asset::ICPUBottomLevelAccelerationStructure*,BLASUsedInTLASBuild>;
10981105
cpu_to_gpu_blas_map_t m_blasBuildMap;
10991106

11001107
//

src/nbl/video/utilities/CAssetConverter.cpp

Lines changed: 48 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -3604,8 +3604,10 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
36043604
outputReverseMap[gpuObj.value.get()] = i++;
36053605
}
36063606
);
3607-
auto markFailureInStaging = [&reservations,&outputReverseMap,logger]<Asset AssetType>(const char* message, const asset_traits<AssetType>::video_t* gpuObj, core::blake3_hash_t* hash)->void
3607+
auto markFailureInStaging = [&reservations,&outputReverseMap,logger]<Asset AssetType>(const char* message, smart_refctd_ptr<const AssetType>& canonical, const asset_traits<AssetType>::video_t* gpuObj, core::blake3_hash_t* hash)->void
36083608
{
3609+
// wipe the smart pointer to the canonical, make sure we release that memory ASAP if no other user is around
3610+
canonical = nullptr;
36093611
logger.log("%s failed for \"%s\"",system::ILogger::ELL_ERROR,message,gpuObj->getObjectDebugName());
36103612
// change the content hash on the reverse map to a NoContentHash
36113613
*hash = CHashCache::NoContentHash;
@@ -3695,13 +3697,13 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
36953697
success = success && params.utilities->updateBufferRangeViaStagingBuffer(*params.transfer,range,item.canonical->getPointer());
36963698
// current recording buffer may have changed
36973699
xferCmdBuf = params.transfer->getCommandBufferForRecording();
3698-
// let go of canonical asset (may free RAM)
3699-
item.canonical = nullptr;
37003700
if (!success)
37013701
{
3702-
markFailureInStaging.operator()<ICPUBuffer>("Data Upload",buffer,pFoundHash);
3702+
markFailureInStaging("Data Upload",item.canonical,buffer,pFoundHash);
37033703
continue;
37043704
}
3705+
// let go of canonical asset (may free RAM)
3706+
item.canonical = nullptr;
37053707
submitsNeeded |= IQueue::FAMILY_FLAGS::TRANSFER_BIT;
37063708
// enqueue ownership release if necessary
37073709
if (ownerQueueFamily!=IQueue::FamilyIgnored)
@@ -3899,7 +3901,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
38993901
}
39003902
if (!quickWriteDescriptor(SrcMipBinding,srcIx,std::move(srcView)))
39013903
{
3902-
markFailureInStaging.operator()<ICPUImage>("Source Mip Level Descriptor Write",image,pFoundHash);
3904+
markFailureInStaging("Source Mip Level Descriptor Write",item.canonical,image,pFoundHash);
39033905
continue;
39043906
}
39053907
}
@@ -4152,7 +4154,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
41524154
const auto oldImmediateSubmitSignalValue = params.transfer->scratchSemaphore.value;
41534155
if (!params.utilities->updateImageViaStagingBuffer(*params.transfer,cpuImg->getBuffer()->getPointer(),cpuImg->getCreationParameters().format,image,tmp.newLayout,regions))
41544156
{
4155-
logger.log("Image Redion Upload failed!", system::ILogger::ELL_ERROR);
4157+
logger.log("Image Region Upload failed!", system::ILogger::ELL_ERROR);
41564158
break;
41574159
}
41584160
// stall callback is only called if multiple buffering of scratch commandbuffers fails, we also want to submit compute if transfer was submitted
@@ -4207,16 +4209,18 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
42074209
// failed in the for-loop
42084210
if (lvl != creationParams.mipLevels)
42094211
{
4210-
markFailureInStaging.operator()<ICPUImage>("Compute Mip Mapping",image,pFoundHash);
4212+
markFailureInStaging("Compute Mip Mapping",item.canonical,image,pFoundHash);
42114213
continue;
42124214
}
4215+
// let go of canonical asset (may free RAM)
4216+
item.canonical = nullptr;
42134217
}
42144218
// here we only record barriers that do final layout transitions and release ownership to final queue family
42154219
if (!transferBarriers.empty())
42164220
{
42174221
if (!pipelineBarrier(xferCmdBuf,{.memBarriers={},.bufBarriers={},.imgBarriers=transferBarriers},"Final Pipeline Barrier recording to Transfer Command Buffer failed"))
42184222
{
4219-
markFailureInStaging.operator()<ICPUImage>("Image Data Upload Pipeline Barrier",image,pFoundHash);
4223+
markFailureInStaging("Image Data Upload Pipeline Barrier",item.canonical,image,pFoundHash);
42204224
continue;
42214225
}
42224226
submitsNeeded |= IQueue::FAMILY_FLAGS::TRANSFER_BIT;
@@ -4226,7 +4230,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
42264230
dsAlloc->multi_deallocate(SrcMipBinding,1,&srcIx,params.compute->getFutureScratchSemaphore());
42274231
if (!pipelineBarrier(computeCmdBuf,{.memBarriers={},.bufBarriers={},.imgBarriers=computeBarriers},"Final Pipeline Barrier recording to Compute Command Buffer failed"))
42284232
{
4229-
markFailureInStaging.operator()<ICPUImage>("Compute Mip Mapping Pipeline Barrier",image,pFoundHash);
4233+
markFailureInStaging("Compute Mip Mapping Pipeline Barrier",item.canonical,image,pFoundHash);
42304234
continue;
42314235
}
42324236
}
@@ -4244,8 +4248,6 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
42444248
{
42454249
// we release BLAS and TLAS Storage Buffer ownership at the same time, because BLASes about to be released may need to be read by TLAS builds
42464250
core::vector<buffer_mem_barrier_t> ownershipTransfers;
4247-
// the already compacted BLASes need to be written into the TLASes using them
4248-
core::unordered_map<IGPUBottomLevelAccelerationStructure*,smart_refctd_ptr<IGPUBottomLevelAccelerationStructure>> compactedBLASMap;
42494251

42504252
// Device Builds
42514253
auto& blasesToBuild = reservations.m_blasConversions[0];
@@ -4271,7 +4273,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
42714273
// Device BLAS builds
42724274
if (blasCount)
42734275
{
4274-
compactedBLASMap.reserve(blasCount);
4276+
// build
42754277
#ifdef NBL_ACCELERATION_STRUCTURE_CONVERSION
42764278
constexpr auto GeometryIsAABBFlag = ICPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT;
42774279

@@ -4303,7 +4305,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
43034305
//
43044306
if (!device->buildAccelerationStructure(dOp.get(),info,range))
43054307
{
4306-
markFailureInStaging.operator()<ICPUBottomLevelAccelerationStructure>("BLAS Build Command Recording",gpuObj,pFoundHash);
4308+
markFailureInStaging("BLAS Build Command Recording",item.canonical,gpuObj,pFoundHash);
43074309
continue;
43084310
}
43094311
}
@@ -4324,6 +4326,11 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
43244326
}
43254327
}
43264328
#endif
4329+
// compact
4330+
{
4331+
// the already compacted BLASes need to be written into the TLASes using them, want to swap them out ASAP
4332+
//reservations.m_blasBuildMap[canonical].gpuBLAS = compacted;
4333+
}
43274334
blasesToBuild.clear();
43284335
}
43294336

@@ -4350,7 +4357,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
43504357
//
43514358
core::vector<IGPUTopLevelAccelerationStructure::DeviceBuildInfo> buildInfos;
43524359
buildInfos.reserve(tlasCount);
4353-
core::vector<const IGPUBottomLevelAccelerationStructure*> trackedBLASes;
4360+
core::vector<smart_refctd_ptr<const IGPUBottomLevelAccelerationStructure>> trackedBLASes;
43544361
trackedBLASes.reserve(hlsl::max(tlasCount,blasCount));
43554362
core::vector<IGPUTopLevelAccelerationStructure::BuildRangeInfo> rangeInfos;
43564363
rangeInfos.reserve(tlasCount);
@@ -4362,14 +4369,16 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
43624369
for (auto& info : buildInfos)
43634370
{
43644371
const auto offset = info.trackedBLASes.data();
4365-
info.trackedBLASes = {trackedBLASes.data()+reinterpret_cast<const size_t&>(offset),info.trackedBLASes.size()};
4372+
const auto correctPtr = trackedBLASes.data()+reinterpret_cast<const size_t&>(offset);
4373+
info.trackedBLASes = {reinterpret_cast<const IGPUBottomLevelAccelerationStructure** const&>(correctPtr),info.trackedBLASes.size()};
43664374
}
43674375
//
43684376
if (!computeCmdBuf->cmdbuf->buildAccelerationStructures({buildInfos},rangeInfos.data()))
43694377
for (const auto& info : buildInfos)
43704378
{
43714379
const auto pFoundHash = findInStaging.operator()<ICPUTopLevelAccelerationStructure>(info.dstAS);
4372-
markFailureInStaging.operator()<ICPUTopLevelAccelerationStructure>("TLAS Build Command Recording",info.dstAS,pFoundHash); // TODO: make messages configurable message
4380+
smart_refctd_ptr<const ICPUTopLevelAccelerationStructure> dummy; // already null at this point
4381+
markFailureInStaging("TLAS Build Command Recording",dummy,info.dstAS,pFoundHash);
43734382
}
43744383
buildInfos.clear();
43754384
rangeInfos.clear();
@@ -4379,8 +4388,11 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
43794388
using scratch_allocator_t = std::remove_reference_t<decltype(*params.scratchForDeviceASBuild)>;
43804389
using addr_t = typename scratch_allocator_t::size_type;
43814390
const auto& limits = physDev->getLimits();
4382-
for (const auto& tlasToBuild : tlasesToBuild)
4391+
core::unordered_set<smart_refctd_ptr<const IGPUBottomLevelAccelerationStructure>> dedupBLASesUsed;
4392+
dedupBLASesUsed.reserve(reservations.m_blasBuildMap.size());
4393+
for (auto& tlasToBuild : tlasesToBuild)
43834394
{
4395+
dedupBLASesUsed.clear();
43844396
const auto as = tlasToBuild.gpuObj;
43854397
const auto pFoundHash = findInStaging.operator()<ICPUTopLevelAccelerationStructure>(as);
43864398
const auto instances = tlasToBuild.canonical->getInstances();
@@ -4399,7 +4411,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
43994411
// problem with finding the dependents (BLASes)
44004412
if (instanceDataSize==0)
44014413
{
4402-
markFailureInStaging.operator()<ICPUTopLevelAccelerationStructure>("Finding Dependant GPU BLASes for TLAS build",as,pFoundHash);
4414+
markFailureInStaging("Finding Dependant GPU BLASes for TLAS build",tlasToBuild.canonical,as,pFoundHash);
44034415
continue;
44044416
}
44054417
// allocate scratch and build inputs
@@ -4451,19 +4463,26 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
44514463
const auto newWritten = bytesWritten+size;
44524464
if (newWritten>=blockSize)
44534465
return bytesWritten;
4454-
auto blas = blasBuildMap->find(instance.getBase().blas.get())->second;
4466+
auto found = blasBuildMap->find(instance.getBase().blas.get());
4467+
assert(found!=blasBuildMap.end());
4468+
const auto& blas = found->second.gpuBLAS;
44554469
dst = IGPUTopLevelAccelerationStructure::writeInstance(dst,instance,blas.get()->getReferenceForDeviceOperations());
4470+
dedupBLASesUsed->emplace(blas);
4471+
if (--found->second.remainingUsages == 0)
4472+
blasBuildMap->erase(found);
44564473
bytesWritten = newWritten;
44574474
}
44584475
}
44594476

4477+
SReserveResult::cpu_to_gpu_blas_map_t* blasBuildMap;
4478+
core::unordered_set<smart_refctd_ptr<const IGPUBottomLevelAccelerationStructure>>* dedupBLASesUsed;
44604479
std::span<const ICPUTopLevelAccelerationStructure::PolymorphicInstance> instances;
4461-
const SReserveResult::cpu_to_gpu_blas_map_t* blasBuildMap;
44624480
uint32_t instanceIndex = 0;
44634481
};
44644482
FillInstances fillInstances;
4465-
fillInstances.instances = instances;
44664483
fillInstances.blasBuildMap = &reservations.m_blasBuildMap;
4484+
fillInstances.dedupBLASesUsed = &dedupBLASesUsed;
4485+
fillInstances.instances = instances;
44674486
success = success && params.utilities->updateBufferRangeViaStagingBuffer(*params.transfer,range,fillInstances);
44684487
}
44694488
if (as->usesMotion())
@@ -4501,9 +4520,11 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
45014520
xferCmdBuf = params.transfer->getCommandBufferForRecording();
45024521
if (!success)
45034522
{
4504-
markFailureInStaging.operator()<ICPUTopLevelAccelerationStructure>("Uploading Instance Data for TLAS build failed",as,pFoundHash);
4523+
markFailureInStaging("Uploading Instance Data for TLAS build failed",tlasToBuild.canonical,as,pFoundHash);
45054524
continue;
45064525
}
4526+
// let go of canonical asset (may free RAM)
4527+
tlasToBuild.canonical = nullptr;
45074528
}
45084529
// prepare build infos
45094530
auto& buildInfo = buildInfos.emplace_back();
@@ -4517,16 +4538,18 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
45174538
{
45184539
const auto offset = trackedBLASes.size();
45194540
using p_p_BLAS_t = const IGPUBottomLevelAccelerationStructure**;
4520-
buildInfo.trackedBLASes = {reinterpret_cast<const p_p_BLAS_t&>(offset),instanceCount};
4541+
buildInfo.trackedBLASes = {reinterpret_cast<const p_p_BLAS_t&>(offset),dedupBLASesUsed.size()};
4542+
for (auto& blas : dedupBLASesUsed)
4543+
trackedBLASes.emplace_back(std::move(blas));
4544+
45214545
}
45224546
// no special extra byte offset into the instance buffer
45234547
rangeInfos.emplace_back(instanceCount,0u);
45244548
}
4549+
// finish the last batch
45254550
recordBuildCommands();
45264551
computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Compact TLASes END");
45274552
computeCmdBuf->cmdbuf->endDebugMarker();
4528-
// no longer need this info
4529-
compactedBLASMap.clear();
45304553
}
45314554
// compact
45324555
computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Compact TLASes START");

0 commit comments

Comments
 (0)