Skip to content

Commit 8d30513

Browse files
author
devsh
committed
add much-needed lifetime tracking to BLASes
Check Scratch Allocator's max serviceable alignment, and add configurable error messages to `markFailureInStaging`
1 parent 16fb038 commit 8d30513

File tree

2 files changed

+114
-36
lines changed

2 files changed

+114
-36
lines changed

include/nbl/video/utilities/CAssetConverter.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1083,14 +1083,18 @@ class CAssetConverter : public core::IReferenceCounted
10831083
uint64_t compactedASWriteOffset : 48 = WontCompact;
10841084
uint64_t buildFlags : 16 = static_cast<uint16_t>(build_f::NONE);
10851085
};
1086-
core::vector<SConvReqAccelerationStructure<asset::ICPUBottomLevelAccelerationStructure>> m_blasConversions[2];
1087-
core::vector<SConvReqAccelerationStructure<asset::ICPUTopLevelAccelerationStructure>> m_tlasConversions[2];
1086+
using SConvReqBLAS = SConvReqAccelerationStructure<asset::ICPUBottomLevelAccelerationStructure>;
1087+
core::vector<SConvReqBLAS> m_blasConversions[2];
1088+
using SConvReqTLAS = SConvReqAccelerationStructure<asset::ICPUTopLevelAccelerationStructure>;
1089+
core::vector<SConvReqTLAS> m_tlasConversions[2];
10881090

10891091
// 0 for device builds, 1 for host builds
10901092
uint64_t m_minASBuildScratchSize[2] = {0,0};
10911093
uint64_t m_maxASBuildScratchSize[2] = {0,0};
10921094
// We do all compactions on the Device for simplicity
10931095
uint8_t m_willCompactSomeAS : 1 = false;
1096+
// This tracks non-root BLASes which are needed for a subsequent TLAS build. Note that even things which are NOT in the staging cache are tracked here to make sure they don't finish their lifetimes early.
1097+
core::unordered_map<const asset::ICPUBottomLevelAccelerationStructure*,asset_cached_t<asset::ICPUBottomLevelAccelerationStructure>> m_blasBuildMap;
10941098

10951099
//
10961100
core::bitflag<IQueue::FAMILY_FLAGS> m_queueFlags = IQueue::FAMILY_FLAGS::NONE;

src/nbl/video/utilities/CAssetConverter.cpp

Lines changed: 108 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -3567,6 +3567,12 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
35673567
logger.log("Acceleration Structure Scratch Device Memory Allocator not large enough!",system::ILogger::ELL_ERROR);
35683568
return retval;
35693569
}
3570+
const auto minScratchAlignment = device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment;
3571+
if (addrAlloc.max_alignment()<minScratchAlignment)
3572+
{
3573+
logger.log("Accceleration Structure Scratch Device Memory Allocator cannot allocate with Physical Device's minimum required AS-build scratch alignment %u",system::ILogger::ELL_ERROR,minScratchAlignment);
3574+
return retval;
3575+
}
35703576
}
35713577
// the elusive and exotic host builds
35723578
if (reservations.willHostASBuild() && !params.scratchForHostASBuild)
@@ -3590,9 +3596,9 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
35903596
return const_cast<core::blake3_hash_t*>(&found->second.value);
35913597
};
35923598
// wipe gpu item in staging cache (this may drop it as well if it was made for only a root asset == no users)
3593-
auto markFailureInStaging = [logger](auto* gpuObj, core::blake3_hash_t* hash)->void
3599+
auto markFailureInStaging = [logger](const char* message, auto* gpuObj, core::blake3_hash_t* hash)->void
35943600
{
3595-
logger.log("Data upload failed for \"%s\"",system::ILogger::ELL_ERROR,gpuObj->getObjectDebugName());
3601+
logger.log("%s failed for \"%s\"",system::ILogger::ELL_ERROR,message,gpuObj->getObjectDebugName());
35963602
// change the content hash on the reverse map to a NoContentHash
35973603
*hash = CHashCache::NoContentHash;
35983604
};
@@ -3677,7 +3683,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
36773683
item.canonical = nullptr;
36783684
if (!success)
36793685
{
3680-
markFailureInStaging(buffer,pFoundHash);
3686+
markFailureInStaging("Data Upload",buffer,pFoundHash);
36813687
continue;
36823688
}
36833689
submitsNeeded |= IQueue::FAMILY_FLAGS::TRANSFER_BIT;
@@ -3877,7 +3883,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
38773883
}
38783884
if (!quickWriteDescriptor(SrcMipBinding,srcIx,std::move(srcView)))
38793885
{
3880-
markFailureInStaging(image,pFoundHash);
3886+
markFailureInStaging("Source Mip Level Descriptor Write",image,pFoundHash);
38813887
continue;
38823888
}
38833889
}
@@ -4185,7 +4191,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
41854191
// failed in the for-loop
41864192
if (lvl != creationParams.mipLevels)
41874193
{
4188-
markFailureInStaging(image, pFoundHash);
4194+
markFailureInStaging("Compute Mip Mapping",image,pFoundHash);
41894195
continue;
41904196
}
41914197
}
@@ -4194,7 +4200,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
41944200
{
41954201
if (!pipelineBarrier(xferCmdBuf,{.memBarriers={},.bufBarriers={},.imgBarriers=transferBarriers},"Final Pipeline Barrier recording to Transfer Command Buffer failed"))
41964202
{
4197-
markFailureInStaging(image, pFoundHash);
4203+
markFailureInStaging("Image Data Upload Pipeline Barrier",image,pFoundHash);
41984204
continue;
41994205
}
42004206
submitsNeeded |= IQueue::FAMILY_FLAGS::TRANSFER_BIT;
@@ -4204,7 +4210,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
42044210
dsAlloc->multi_deallocate(SrcMipBinding,1,&srcIx,params.compute->getFutureScratchSemaphore());
42054211
if (!pipelineBarrier(computeCmdBuf,{.memBarriers={},.bufBarriers={},.imgBarriers=computeBarriers},"Final Pipeline Barrier recording to Compute Command Buffer failed"))
42064212
{
4207-
markFailureInStaging(image,pFoundHash);
4213+
markFailureInStaging("Compute Mip Mapping Pipeline Barrier",image,pFoundHash);
42084214
continue;
42094215
}
42104216
}
@@ -4281,7 +4287,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
42814287
//
42824288
if (!device->buildAccelerationStructure(dOp.get(),info,range))
42834289
{
4284-
markFailureInStaging(gpuObj,pFoundHash);
4290+
markFailureInStaging("BLAS Build Command Recording",gpuObj,pFoundHash);
42854291
continue;
42864292
}
42874293
}
@@ -4332,7 +4338,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
43324338
trackedBLASes.reserve(hlsl::max(tlasCount,blasCount));
43334339
core::vector<IGPUTopLevelAccelerationStructure::BuildRangeInfo> rangeInfos;
43344340
rangeInfos.reserve(tlasCount);
4335-
auto recordBuilds = [&]()->void
4341+
auto recordBuildCommands = [&]()->void
43364342
{
43374343
if (buildInfos.empty())
43384344
return;
@@ -4347,7 +4353,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
43474353
for (const auto& info : buildInfos)
43484354
{
43494355
const auto pFoundHash = findInStaging.operator()<ICPUTopLevelAccelerationStructure>(info.dstAS);
4350-
markFailureInStaging(info.dstAS,pFoundHash); // TODO: make messages configurable message
4356+
markFailureInStaging("TLAS Build Command Recording",info.dstAS,pFoundHash); // TODO: make messages configurable message
43514357
}
43524358
buildInfos.clear();
43534359
rangeInfos.clear();
@@ -4363,49 +4369,118 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
43634369
const auto pFoundHash = findInStaging.operator()<ICPUTopLevelAccelerationStructure>(as);
43644370
const auto instances = tlasToBuild.canonical->getInstances();
43654371
const auto instanceCount = static_cast<uint32_t>(instances.size());
4366-
const auto instanceSize = true ? sizeof(IGPUTopLevelAccelerationStructure::DevicePolymorphicInstance):sizeof(IGPUTopLevelAccelerationStructure::DevicePolymorphicInstance);
4367-
// allocate scratch and build inputs
4368-
addr_t offsets[2] = {scratch_allocator_t::invalid_value,scratch_allocator_t::invalid_value};
4372+
size_t instanceDataSize = 0;
4373+
// gather total input size and check dependants exist
4374+
for (const auto& instance : instances)
43694375
{
4370-
const addr_t sizes[2] = {tlasToBuild.scratchSize,instanceSize*instanceCount};
4371-
const addr_t alignments[2] = {limits.minAccelerationStructureScratchOffsetAlignment,8}; // TODO: check address allocator can service these alignments
4372-
const size_t worstSize = core::alignUp(sizes[0],alignments[1])+sizes[1];
4373-
// it will never fit (prevent CPU hangs)
4374-
if (const auto& addrAlloc=params.scratchForDeviceASBuild->getAddressAllocator(); addrAlloc.get_free_size()+addrAlloc.get_allocated_size()<worstSize)
4376+
if (reservations.m_blasBuildMap.find(instance.getBase().blas.get())==reservations.m_blasBuildMap.end())
43754377
{
4376-
markFailureInStaging(as,pFoundHash);
4377-
continue;
4378+
instanceDataSize = 0;
4379+
break;
4380+
}
4381+
using instance_type_t = ICPUTopLevelAccelerationStructure::INSTANCE_TYPE;
4382+
switch (instance.getType())
4383+
{
4384+
case instance_type_t::SRT_MOTION:
4385+
instanceDataSize += sizeof(IGPUTopLevelAccelerationStructure::DeviceSRTMotionInstance);
4386+
break;
4387+
case instance_type_t::MATRIX_MOTION:
4388+
instanceDataSize += sizeof(IGPUTopLevelAccelerationStructure::DeviceMatrixMotionInstance);
4389+
break;
4390+
default:
4391+
instanceDataSize += sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance);
4392+
break;
43784393
}
4394+
}
4395+
// problem with finding the dependents (BLASes)
4396+
if (instanceDataSize==0)
4397+
{
4398+
markFailureInStaging("Finding Dependant GPU BLASes for TLAS build",as,pFoundHash);
4399+
continue;
4400+
}
4401+
// allocate scratch and build inputs
4402+
constexpr uint32_t AllocCount = 3;
4403+
addr_t offsets[3] = {scratch_allocator_t::invalid_value,scratch_allocator_t::invalid_value,scratch_allocator_t::invalid_value};
4404+
const addr_t sizes[AllocCount] = {tlasToBuild.scratchSize,instanceDataSize,sizeof(void*)*instanceCount};
4405+
{
4406+
const addr_t alignments[AllocCount] = {limits.minAccelerationStructureScratchOffsetAlignment,16,8};
4407+
/* TODO: move to reserve phase - prevent CPU hangs by making sure allocator big enough to service us
4408+
{
4409+
addr_t worstSize = sizes[0];
4410+
for (auto i=1u; i<AllocCount; i++)
4411+
worstSize = core::alignUp(worstSize,alignments[i])+sizes[i];
4412+
if (worstSize>minScratchSize)
4413+
minScratchSize = worstSize;
4414+
}*/
43794415
// if fail then flush and keep trying till space is made
4380-
for (uint32_t t=0; params.scratchForDeviceASBuild->multi_allocate(2u,&offsets[0],&sizes[0],&alignments[0])!=0u; t++)
4416+
for (uint32_t t=0; params.scratchForDeviceASBuild->multi_allocate(AllocCount,&offsets[0],&sizes[0],&alignments[0])!=0u; t++)
43814417
if (t==1) // don't flush right away cause allocator not defragmented yet
43824418
{
4383-
recordBuilds();
4419+
recordBuildCommands();
4420+
// TODO: make sure compute acquires ownership of geometry data for the build
43844421
drainCompute();
43854422
}
4386-
params.scratchForDeviceASBuild->multi_deallocate(2,&offsets[0],&sizes[0],params.compute->getFutureScratchSemaphore());
4423+
// queue up a deferred allocation
4424+
params.scratchForDeviceASBuild->multi_deallocate(AllocCount,&offsets[0],&sizes[0],params.compute->getFutureScratchSemaphore());
43874425
}
4388-
// stream the instance/geometry input in && check dependents
4389-
// unfortunately can't count on large ReBAR heaps so we can't force the `scratchBuffer` to be mapped and writable
4390-
for (const auto& instance : instances)
4426+
// stream the instance/geometry input in
4427+
// unfortunately can't count on large ReBAR heaps so we can't force the `scratchBuffer` to be mapped and writable
4428+
SBufferRange<IGPUBuffer> range = {.offset=offsets[2],.size=sizes[2],.buffer=smart_refctd_ptr<IGPUBuffer>(params.scratchForDeviceASBuild->getBuffer())};
43914429
{
4392-
instance.instance;
4430+
// TODO: make sure the overflow submit work callback is doing some CPU work
4431+
// TODO: write the callbacks
4432+
struct FillInstancePointers
4433+
{
4434+
// uint32_t increaseAlignment(const uint32_t original) override {return sizeof(void*);}
4435+
4436+
size_t operator()(void* dst, const size_t offsetInRange, const size_t blockSize)
4437+
{
4438+
assert(false);
4439+
return 0ul;
4440+
}
4441+
};
4442+
FillInstancePointers fillInstancePointers;
4443+
bool success = params.utilities->updateBufferRangeViaStagingBuffer(*params.transfer,range,fillInstancePointers);
4444+
range.offset = offsets[1];
4445+
range.size = sizes[1];
4446+
struct FillInstances
4447+
{
4448+
// uint32_t increaseAlignment(const uint32_t original) override {return sizeof(void*);}
4449+
4450+
size_t operator()(void* dst, const size_t offsetInRange, const size_t blockSize)
4451+
{
4452+
assert(false);
4453+
return 0ul;
4454+
}
4455+
};
4456+
FillInstances fillInstances;
4457+
params.utilities->updateBufferRangeViaStagingBuffer(*params.transfer,range,fillInstances);
4458+
// TODO: pipeline barrier & ownership release between xfer and compute
4459+
// current recording buffer may have changed
4460+
xferCmdBuf = params.transfer->getCommandBufferForRecording();
4461+
if (!success)
4462+
{
4463+
markFailureInStaging("Uploading Instance Data for TLAS build failed",as,pFoundHash);
4464+
continue;
4465+
}
43934466
}
43944467
// prepare build infos
43954468
auto& buildInfo = buildInfos.emplace_back();
4396-
buildInfo.scratch = {.offset=offsets[0],.buffer=smart_refctd_ptr<IGPUBuffer>(params.scratchForDeviceASBuild->getBuffer())};
4469+
buildInfo.scratch = {.offset=range.offset,.buffer=smart_refctd_ptr(range.buffer)};
43974470
buildInfo.buildFlags = tlasToBuild.getBuildFlags();
43984471
buildInfo.dstAS = as;
4399-
buildInfo.instanceData = {.offset=offsets[1],.buffer=smart_refctd_ptr<IGPUBuffer>(params.scratchForDeviceASBuild->getBuffer())};
4472+
// note we don't build directly from staging, because only very small inputs could come from there and they'd impede the transfer efficiency of the larger ones
4473+
buildInfo.instanceData = {.offset=offsets[1],.buffer=smart_refctd_ptr(range.buffer)};
44004474
// be based cause vectors can grow
44014475
{
44024476
const auto offset = trackedBLASes.size();
44034477
using p_p_BLAS_t = const IGPUBottomLevelAccelerationStructure**;
44044478
buildInfo.trackedBLASes = {reinterpret_cast<const p_p_BLAS_t&>(offset),instanceCount};
44054479
}
4480+
// no special extra byte offset into the instance buffer
44064481
rangeInfos.emplace_back(instanceCount,0u);
44074482
}
4408-
recordBuilds();
4483+
recordBuildCommands();
44094484
computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Compact TLASes END");
44104485
computeCmdBuf->cmdbuf->endDebugMarker();
44114486
// no longer need this info
@@ -4666,18 +4741,19 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
46664741
auto* pGpuObj = item.first;
46674742
if (depsMissing)
46684743
{
4669-
const auto* hashAsU64 = reinterpret_cast<const uint64_t*>(item.second.value.data);
4670-
logger.log("GPU Obj %s not writing to final cache because conversion of a dependant failed!", system::ILogger::ELL_ERROR, pGpuObj->getObjectDebugName());
4744+
logger.log("GPU Obj %s not writing to final cache because conversion of a dependant failed!",system::ILogger::ELL_ERROR,pGpuObj->getObjectDebugName());
46714745
// wipe self, to let users know
46724746
item.second.value = {};
46734747
continue;
46744748
}
4749+
// The BLASes don't need to do this, because no-one checks for them as dependents and we can substitute the `item.first` in the staging cache right away
46754750
// For TLASes we need to write the compacted TLAS and not the intermediate build to the Cache
46764751
if constexpr (IsTLAS)
46774752
{
46784753
auto found = compactedTLASMap.find(pGpuObj);
46794754
if (found!=compactedTLASMap.end())
46804755
pGpuObj = found->second.get();
4756+
46814757
}
46824758
// We have success now, but ask callback if we write to the new cache.
46834759
if (!params.writeCache(item.second)) // TODO: let the user know the pointer to the GPU Object too?
@@ -4691,10 +4767,8 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
46914767
// again, need to go bottom up so we can check dependencies being successes
46924768
mergeCache.operator()<ICPUBuffer>();
46934769
mergeCache.operator()<ICPUImage>();
4694-
#ifdef NBL_ACCELERATION_STRUCTURE_CONVERSION
46954770
mergeCache.operator()<ICPUBottomLevelAccelerationStructure>();
46964771
mergeCache.operator()<ICPUTopLevelAccelerationStructure>();
4697-
#endif
46984772
mergeCache.operator()<ICPUBufferView>();
46994773
mergeCache.operator()<ICPUImageView>();
47004774
mergeCache.operator()<ICPUShader>();

0 commit comments

Comments
 (0)