Skip to content

Commit 0f42726

Browse files
author
devsh
committed
implemented BLAS tracking for TLAS device-side copies
1 parent 5b6e20e commit 0f42726

File tree

4 files changed

+105
-34
lines changed

4 files changed

+105
-34
lines changed

include/nbl/video/IGPUAccelerationStructure.h

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -379,6 +379,9 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr
379379
//
380380
inline uint32_t getMaxInstanceCount() const {return m_maxInstanceCount;}
381381

382+
//
383+
using blas_smart_ptr_t = core::smart_refctd_ptr<const IGPUBottomLevelAccelerationStructure>;
384+
382385
// copies
383386
struct CopyInfo
384387
{
@@ -392,7 +395,7 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr
392395
const IGPUTopLevelAccelerationStructure* src = nullptr;
393396
asset::SBufferBinding<BufferType> dst = nullptr;
394397
// [optional] Query the tracked BLASes
395-
core::smart_refctd_dynamic_array<core::smart_refctd_ptr<IGPUBottomLevelAccelerationStructure>> trackedBLASes = nullptr;
398+
core::smart_refctd_dynamic_array<blas_smart_ptr_t> trackedBLASes = nullptr;
396399
};
397400
using DeviceCopyToMemoryInfo = CopyToMemoryInfo<IGPUBuffer>;
398401
using HostCopyToMemoryInfo = CopyToMemoryInfo<asset::ICPUBuffer>;
@@ -693,8 +696,6 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr
693696
{
694697
return ++m_pendingBuildVer;
695698
}
696-
//
697-
using blas_smart_ptr_t = core::smart_refctd_ptr<const IGPUBottomLevelAccelerationStructure>;
698699
// `*count` is in/out: on return it holds the total number of tracked BLASes; if `tracked!=nullptr`, the first `*count` (value on entry) tracked BLASes are also written into `tracked`
699700
inline void getPendingBuildTrackedBLASes(uint32_t* count, blas_smart_ptr_t* tracked, const build_ver_t buildVer) const
700701
{
@@ -703,10 +704,12 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr
703704
// stop multiple threads messing with us
704705
std::lock_guard lk(m_trackingLock);
705706
auto pBLASes = getPendingBuildTrackedBLASes(buildVer);
707+
const auto origCount = *count;
706708
*count = pBLASes ? pBLASes->size():0;
707709
if (!tracked || !pBLASes)
708710
return;
709-
for (auto it=pBLASes->begin(); it!=pBLASes->end(); it++)
711+
auto it = pBLASes->begin();
712+
for (auto i = 0; i<origCount; i++)
710713
*(tracked++) = *(it++);
711714
}
712715
// Useful if TLAS got built externally as well
@@ -747,7 +750,7 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr
747750
const uint32_t m_maxInstanceCount;
748751

749752
private:
750-
friend class IGPUCommandBuffer;
753+
friend class IQueue;
751754
inline const core::unordered_set<blas_smart_ptr_t>* getPendingBuildTrackedBLASes(const build_ver_t buildVer) const
752755
{
753756
const auto found = std::find_if(m_pendingBuilds.begin(),m_pendingBuilds.end(),[buildVer](const auto& item)->bool{return item.ordinal==buildVer;});

include/nbl/video/IGPUCommandBuffer.h

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -571,7 +571,7 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject
571571
auto oit = reserveReferences(size);
572572
if (oit)
573573
{
574-
m_TLASToBLASReferenceSets[tlas] = {oit,size};
574+
m_TLASTrackingOps.emplace_back(TLASTrackingWrite{.src={oit,size},.dst=tlas});
575575
while (beginBLASes!=endBLASes)
576576
*(oit++) = core::smart_refctd_ptr<const core::IReferenceCounted>(*(beginBLASes++));
577577
}
@@ -750,7 +750,7 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject
750750
m_state = STATE::INITIAL;
751751

752752
m_boundDescriptorSetsRecord.clear();
753-
m_TLASToBLASReferenceSets.clear();
753+
m_TLASTrackingOps.clear();
754754
m_boundGraphicsPipeline= nullptr;
755755
m_boundComputePipeline= nullptr;
756756
m_boundRayTracingPipeline= nullptr;
@@ -768,7 +768,7 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject
768768
{
769769
deleteCommandList();
770770
m_boundDescriptorSetsRecord.clear();
771-
m_TLASToBLASReferenceSets.clear();
771+
m_TLASTrackingOps.clear();
772772
m_boundGraphicsPipeline= nullptr;
773773
m_boundComputePipeline= nullptr;
774774
m_boundRayTracingPipeline= nullptr;
@@ -909,10 +909,26 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject
909909
// or IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_UNUSED_WHILE_PENDING_BIT.
910910
core::unordered_map<const IGPUDescriptorSet*,uint64_t> m_boundDescriptorSetsRecord;
911911

912-
// If the user wants the builds to be tracking, and make the TLAS remember the BLASes that have been built into it.
913-
// NOTE: We know that a TLAS may be rebuilt multiple times per frame on purpose and not only the final BLASes need to be kept alive till submission finishes.
914-
// However, the Command Pool already tracks resources referenced in the Build Infos, so we only need pointers into those records.
915-
core::unordered_map<IGPUTopLevelAccelerationStructure*,std::span<const core::smart_refctd_ptr<const IReferenceCounted>>> m_TLASToBLASReferenceSets;
912+
// If the user wants the builds and copies to be tracked, we make the TLAS remember the BLASes that have been built into it.
913+
// The Command Pool already tracks resources referenced in the Build Infos or Copies From Memory (Deserializations), so we only need pointers into those records.
914+
// Recorded when a command overwrites the BLAS references of a TLAS (a TLAS build or a copy from memory).
// `src` is a view over reference records already kept alive for this command buffer, it does not own them.
// `dst` is the TLAS onto which the queue pushes those BLASes as a new pending build version at submit time.
struct TLASTrackingWrite
915+
{
916+
std::span<const core::smart_refctd_ptr<const IReferenceCounted>> src;
917+
IGPUTopLevelAccelerationStructure* dst;
918+
};
919+
// Recorded for a TLAS to TLAS copy command.
// At submit time the BLASes tracked by `src` (for the version being read) get pushed onto `dst` as a new pending build version.
struct TLASTrackingCopy
920+
{
921+
const IGPUTopLevelAccelerationStructure* src;
922+
IGPUTopLevelAccelerationStructure* dst;
923+
};
924+
// Recorded for a TLAS copy to memory command.
// `src` is the TLAS being read, `dst` is the user provided array (taken from `CopyToMemoryInfo::trackedBLASes`) that receives its tracked BLASes at submit time.
// NOTE(review): `trackedBLASes` is documented as optional and defaults to nullptr, so `dst` may presumably be null - confirm consumers check before dereferencing.
struct TLASTrackingRead
925+
{
926+
const IGPUTopLevelAccelerationStructure* src;
927+
// For a copy to memory (Serialization), we need to dump the BLASes references
928+
core::smart_refctd_dynamic_array<IGPUTopLevelAccelerationStructure::blas_smart_ptr_t> dst;
929+
};
930+
// TLAS tracking operations, stored in recording order, which is the order they'll be performed in
931+
core::vector<std::variant<TLASTrackingWrite,TLASTrackingCopy,TLASTrackingRead>> m_TLASTrackingOps;
916932

917933
const IGPUGraphicsPipeline* m_boundGraphicsPipeline;
918934
const IGPUComputePipeline* m_boundComputePipeline;

src/nbl/video/IGPUCommandBuffer.cpp

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -842,10 +842,7 @@ uint32_t IGPUCommandBuffer::buildAccelerationStructures_common(const std::span<c
842842
if constexpr (std::is_same_v<DeviceBuildInfo,IGPUTopLevelAccelerationStructure::DeviceBuildInfo>)
843843
{
844844
const auto blasCount = info.trackedBLASes.size();
845-
if (blasCount)
846-
m_TLASToBLASReferenceSets[info.dstAS] = {oit-blasCount,blasCount};
847-
else
848-
m_TLASToBLASReferenceSets[info.dstAS] = {};
845+
m_TLASTrackingOps.emplace_back(TLASTrackingWrite{.src={oit-blasCount,blasCount},.dst=info.dstAS});
849846
}
850847
}
851848

@@ -890,9 +887,7 @@ bool IGPUCommandBuffer::copyAccelerationStructure(const AccelerationStructure::C
890887
m_noCommands = false;
891888
const bool retval = copyAccelerationStructure_impl(copyInfo.src,copyInfo.dst,copyInfo.compact);
892889
if constexpr (std::is_same_v<AccelerationStructure,IGPUTopLevelAccelerationStructure>)
893-
{
894-
// if (copyInfo.buildVer)
895-
}
890+
m_TLASTrackingOps.emplace_back(TLASTrackingCopy{.src=copyInfo.src,.dst=copyInfo.dst});
896891
return retval;
897892
}
898893
template bool IGPUCommandBuffer::copyAccelerationStructure<IGPUBottomLevelAccelerationStructure>(const IGPUBottomLevelAccelerationStructure::CopyInfo&);
@@ -921,8 +916,7 @@ bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const AccelerationStru
921916
m_noCommands = false;
922917
const bool retval = copyAccelerationStructureToMemory_impl(copyInfo.src,copyInfo.dst);
923918
if constexpr (std::is_same_v<AccelerationStructure,IGPUTopLevelAccelerationStructure>)
924-
{
925-
}
919+
m_TLASTrackingOps.emplace_back(TLASTrackingRead{.src=copyInfo.src,.dst=copyInfo.trackedBLASes});
926920
return retval;
927921
}
928922
template bool IGPUCommandBuffer::copyAccelerationStructureToMemory<IGPUBottomLevelAccelerationStructure>(const IGPUBottomLevelAccelerationStructure::DeviceCopyToMemoryInfo&);
@@ -952,6 +946,16 @@ bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const AccelerationSt
952946
const bool retval = copyAccelerationStructureFromMemory_impl(copyInfo.src,copyInfo.dst);
953947
if constexpr (std::is_same_v<AccelerationStructure,IGPUTopLevelAccelerationStructure>)
954948
{
949+
const auto size = copyInfo.trackedBLASes.size();
950+
auto oit = reserveReferences(size);
951+
if (oit)
952+
{
953+
m_TLASTrackingOps.emplace_back(TLASTrackingWrite{.src={oit,size},.dst=copyInfo.dst});
954+
for (const auto& blas : copyInfo.trackedBLASes)
955+
*(oit++) = core::smart_refctd_ptr<const IReferenceCounted>(blas);
956+
}
957+
else
958+
NBL_LOG_ERROR("out of host memory for BLAS tracking references, TLAS will be copied from memory without BLAS tracking data!");
955959
}
956960
return retval;
957961
}

src/nbl/video/IQueue.cpp

Lines changed: 61 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -149,27 +149,75 @@ IQueue::DeferredSubmitCallback::DeferredSubmitCallback(const SSubmitInfo& info)
149149
auto outRes = m_resources->data();
150150
for (const auto& sema : info.waitSemaphores)
151151
*(outRes++) = smart_ptr(sema.semaphore);
152+
// track our own versions
153+
core::unordered_map<const IGPUTopLevelAccelerationStructure*,IGPUTopLevelAccelerationStructure::build_ver_t> m_readTLASVersions;
154+
// get the TLAS BLAS tracking info and assign a pending build version number
155+
for (const auto& cb : info.commandBuffers)
156+
for (const auto& var : cb.cmdbuf->m_TLASTrackingOps)
157+
{
158+
const IGPUTopLevelAccelerationStructure* src = nullptr;
159+
switch (var.index())
160+
{
161+
case 1:
162+
src = std::get<1>(var).src;
163+
break;
164+
case 2:
165+
src = std::get<2>(var).src;
166+
break;
167+
}
168+
if (src)
169+
m_readTLASVersions.insert({src,src->getPendingBuildVer()});
170+
}
152171
for (const auto& cb : info.commandBuffers)
153172
{
154173
*(outRes++) = smart_ptr(cb.cmdbuf);
155-
// get the TLAS BLAS tracking info and assign a pending build version number
156-
for (const auto& refSet : cb.cmdbuf->m_TLASToBLASReferenceSets)
174+
for (const auto& var : cb.cmdbuf->m_TLASTrackingOps)
175+
switch (var.index())
157176
{
158-
const auto tlas = refSet.first;
159-
using iterator = decltype(refSet.second)::iterator;
160-
struct CustomIterator
177+
case 0:
161178
{
162-
inline bool operator!=(const CustomIterator& other) const {return ptr!=other.ptr;}
179+
const IGPUCommandBuffer::TLASTrackingWrite& op = std::get<0>(var);
180+
using iterator = decltype(op.src)::iterator;
181+
struct CustomIterator
182+
{
183+
inline bool operator!=(const CustomIterator& other) const { return ptr != other.ptr; }
163184

164-
inline CustomIterator operator++() {return {ptr++};}
185+
inline CustomIterator operator++() { return { ptr++ }; }
165186

166-
inline const IGPUBottomLevelAccelerationStructure* operator*() const {return dynamic_cast<const IGPUBottomLevelAccelerationStructure*>(ptr->get());}
187+
inline const IGPUBottomLevelAccelerationStructure* operator*() const { return dynamic_cast<const IGPUBottomLevelAccelerationStructure*>(ptr->get()); }
167188

168-
iterator ptr;
169-
};
170-
const auto buildVer = tlas->pushTrackedBLASes<CustomIterator>({refSet.second.begin()},{refSet.second.end()});
171-
// in theory could assert no duplicate entries, but thats obvious
172-
m_TLASBuilds[tlas] = buildVer;
189+
iterator ptr;
190+
};
191+
m_readTLASVersions[op.dst] = m_TLASBuilds[op.dst] = op.dst->pushTrackedBLASes<CustomIterator>({op.src.begin()},{op.src.end()});
192+
break;
193+
}
194+
case 1:
195+
{
196+
const IGPUCommandBuffer::TLASTrackingCopy& op = std::get<1>(var);
197+
// not sure if copying a TLAS onto itself is even legal, but it would deadlock us
198+
if (op.src==op.dst)
199+
break;
200+
const auto ver = m_readTLASVersions.find(op.src)->second;
201+
// stop multiple threads messing with us
202+
std::lock_guard lk(op.src->m_trackingLock);
203+
const auto* pSrcBLASes = op.src->getPendingBuildTrackedBLASes(ver);
204+
assert(pSrcBLASes);
205+
m_readTLASVersions[op.dst] = m_TLASBuilds[op.dst] = op.dst->pushTrackedBLASes(pSrcBLASes->begin(),pSrcBLASes->end());
206+
break;
207+
}
208+
case 2:
209+
{
210+
const IGPUCommandBuffer::TLASTrackingRead& op = std::get<2>(var);
211+
const auto ver = m_readTLASVersions.find(op.src)->second;
212+
uint32_t count = op.dst->size();
213+
op.src->getPendingBuildTrackedBLASes(&count,op.dst->data(),ver);
214+
if (count>op.dst->size())
215+
cb.cmdbuf->getOriginDevice()->getLogger()->log("BLAS output array too small, should be %d, only wrote out %d BLAS references to destination",system::ILogger::ELL_ERROR,count,op.dst->size());
216+
break;
217+
}
218+
default:
219+
assert(false);
220+
break;
173221
}
174222
}
175223
// We don't hold the last signal semaphore, because the timeline does as an Event trigger.

0 commit comments

Comments
 (0)