Skip to content

Commit a66c565

Browse files
author
devsh
committed
finish the hashing of acceleration structures, and defer the TLAS write to Descriptor set
1 parent 67fc1e8 commit a66c565

File tree

3 files changed

+91
-109
lines changed

3 files changed

+91
-109
lines changed

include/nbl/asset/ICPUAccelerationStructure.h

Lines changed: 6 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -135,26 +135,12 @@ class ICPUBottomLevelAccelerationStructure final : public IPreHashed, public IBo
135135
return cp;
136136
}
137137

138-
//
139-
inline size_t getDependantCount() const override
140-
{
141-
if (!m_geometryPrimitiveCount)
142-
return 0;
143-
if (m_buildFlags.hasFlags(BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT))
144-
return m_AABBGeoms ? m_AABBGeoms->size():0;
145-
else if (m_triangleGeoms)
146-
{
147-
if (usesMotion())
148-
return m_triangleGeoms->size()*3;
149-
else
150-
return m_triangleGeoms->size()*2;
151-
}
152-
return 0;
153-
}
138+
// Do not report anything as a dependant, we'll simply drop the data instead of discarding its contents
139+
inline size_t getDependantCount() const override {return 0;}
154140

155141
inline core::blake3_hash_t computeContentHash() const override
156142
{
157-
if (!m_geometryPrimitiveCount)
143+
if (!missingContent())
158144
return INVALID_HASH;
159145
const bool isAABB = m_buildFlags.hasFlags(BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT);
160146
core::blake3_hasher hasher;
@@ -244,43 +230,15 @@ class ICPUBottomLevelAccelerationStructure final : public IPreHashed, public IBo
244230

245231
inline bool missingContent() const override
246232
{
247-
return !m_triangleGeoms && !m_AABBGeoms && !m_geometryPrimitiveCount;
233+
return !m_geometryPrimitiveCount || !m_triangleGeoms && !m_AABBGeoms;
248234
}
249235

250236
protected:
251237
virtual ~ICPUBottomLevelAccelerationStructure() = default;
252238

253-
inline IAsset* getDependant_impl(const size_t ix) override
254-
{
255-
const ICPUBuffer* buffer = nullptr;
256-
// `ix` is always less than `getDependantCount()`
257-
assert(m_geometryPrimitiveCount);
258-
if (m_buildFlags.hasFlags(BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT))
259-
buffer = m_AABBGeoms ? m_AABBGeoms->operator[](ix).data.buffer.get():nullptr;
260-
else if (m_triangleGeoms)
261-
{
262-
const auto geomCount = m_triangleGeoms->size();
263-
const auto subResourceIx = ix/geomCount;
264-
const auto& triangles = m_triangleGeoms->operator[](ix-subResourceIx*geomCount);
265-
switch (subResourceIx)
266-
{
267-
case 0:
268-
buffer = triangles.indexData.buffer.get();
269-
break;
270-
case 1:
271-
buffer = triangles.vertexData[0].buffer.get();
272-
break;
273-
case 2:
274-
buffer = triangles.vertexData[1].buffer.get();
275-
break;
276-
default:
277-
break;
278-
}
279-
}
280-
return const_cast<ICPUBuffer*>(buffer);
281-
}
239+
inline IAsset* getDependant_impl(const size_t ix) override {return nullptr;}
282240

283-
inline void discardContent_impl() //TODO: sort this out later, override
241+
inline void discardContent_impl() override
284242
{
285243
m_triangleGeoms = nullptr;
286244
m_AABBGeoms = nullptr;

include/nbl/video/utilities/CAssetConverter.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1127,6 +1127,26 @@ class CAssetConverter : public core::IReferenceCounted
11271127
};
11281128
using cpu_to_gpu_blas_map_t = core::unordered_map<const asset::ICPUBottomLevelAccelerationStructure*,BLASUsedInTLASBuild>;
11291129
cpu_to_gpu_blas_map_t m_blasBuildMap;
1130+
//
1131+
struct SDeferredTLASWrite
1132+
{
1133+
inline bool operator==(const SDeferredTLASWrite& other) const = default;
1134+
1135+
IGPUDescriptorSet* dstSet;
1136+
uint32_t binding;
1137+
uint32_t arrayElement;
1138+
};
1139+
struct SDeferredTLASWriteHasher
1140+
{
1141+
inline size_t operator()(const SDeferredTLASWrite& write) const
1142+
{
1143+
size_t retval = std::bit_cast<size_t>(write.dstSet);
1144+
core::hash_combine(retval,write.binding);
1145+
core::hash_combine(retval,write.arrayElement);
1146+
return retval;
1147+
}
1148+
};
1149+
core::unordered_map<SDeferredTLASWrite,core::smart_refctd_ptr<IGPUTopLevelAccelerationStructure>,SDeferredTLASWriteHasher> m_deferredTLASDescriptorWrites;
11301150

11311151
//
11321152
core::bitflag<IQueue::FAMILY_FLAGS> m_queueFlags = IQueue::FAMILY_FLAGS::NONE;

src/nbl/video/utilities/CAssetConverter.cpp

Lines changed: 65 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -455,6 +455,7 @@ class AssetVisitor : public CRTP
455455
for (size_t i=0; i<blasInstances.size(); i++)
456456
{
457457
const auto* blas = blasInstances[i].getBase().blas.get();
458+
// TODO: can one disable instances during builds?
458459
if (!blas)
459460
return false;
460461
CAssetConverter::patch_t<ICPUBottomLevelAccelerationStructure> patch = {blas};
@@ -1145,41 +1146,40 @@ bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_t<ICPUBuffer> loo
11451146
}
11461147
bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_t<ICPUBottomLevelAccelerationStructure> lookup)
11471148
{
1149+
hasher << lookup.patch->isMotion;
1150+
// overriden flags
1151+
hasher << lookup.patch->getBuildFlags(lookup.asset);
11481152
// extras from the patch
11491153
hasher << lookup.patch->hostBuild;
11501154
hasher << lookup.patch->compactAfterBuild;
1151-
// overriden flags
1152-
hasher << lookup.patch->isMotion;
1153-
hasher << lookup.patch->getBuildFlags(lookup.asset);
11541155
// finally the contents
1155-
//TODO: hasher << lookup.asset->getContentHash();
1156+
hasher << lookup.asset->getContentHash();
11561157
return true;
11571158
}
11581159
bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_t<ICPUTopLevelAccelerationStructure> lookup)
11591160
{
1161+
hasher << lookup.patch->isMotion;
1162+
// overriden flags
11601163
const auto* asset = lookup.asset;
1161-
#if 0
1162-
//
1164+
hasher << lookup.patch->getBuildFlags(asset);
1165+
// extras from the patch
1166+
hasher << lookup.patch->hostBuild;
1167+
hasher << lookup.patch->compactAfterBuild;
1168+
const auto instances = asset->getInstances();
1169+
hasher << instances.size();
11631170
AssetVisitor<HashVisit<ICPUTopLevelAccelerationStructure>> visitor = {
11641171
*this,
11651172
{asset,static_cast<const PatchOverride*>(patchOverride)->uniqueCopyGroupID},
11661173
*lookup.patch
11671174
};
11681175
if (!visitor())
11691176
return false;
1170-
// extras from the patch
1171-
hasher << lookup.patch->hostBuild;
1172-
hasher << lookup.patch->compactAfterBuild;
1173-
// overriden flags
1174-
hasher << lookup.patch->isMotion;
1175-
hasher << lookup.patch->getBuildFlags(lookup.asset);
1176-
const auto instances = asset->getInstances();
11771177
// important two passes do not give identical data due to variable length polymorphic array being hashed
11781178
for (const auto& instance : instances)
11791179
hasher << instance.getType();
11801180
for (const auto& instance : instances)
11811181
{
1182-
std::visit([&hasher](const auto& typedInstance)->void
1182+
std::visit([&](const auto& typedInstance)->void
11831183
{
11841184
using instance_t = std::decay_t<decltype(typedInstance)>;
11851185
// the BLAS pointers (the BLAS contents already get hashed via asset visitor and `getDependent`, its only the metadate we need to hash
@@ -1188,7 +1188,6 @@ bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_t<ICPUTopLevelAcc
11881188
instance.instance
11891189
);
11901190
}
1191-
#endif
11921191
return true;
11931192
}
11941193
bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_t<ICPUImage> lookup)
@@ -4650,7 +4649,7 @@ if (worstSize>minScratchSize)
46504649
if (newWritten>=blockSize)
46514650
return bytesWritten;
46524651
auto found = blasBuildMap->find(instance.getBase().blas.get());
4653-
assert(found!=blasBuildMap.end());
4652+
assert(found!=blasBuildMap->end());
46544653
const auto& blas = found->second.gpuBLAS;
46554654
dst = IGPUTopLevelAccelerationStructure::writeInstance(dst,instance,blas.get()->getReferenceForDeviceOperations());
46564655
dedupBLASesUsed->emplace(blas);
@@ -4752,6 +4751,7 @@ if (worstSize>minScratchSize)
47524751
else
47534752
compactedOwnershipReleaseIndices.push_back(~0u);
47544753
}
4754+
reservations.m_blasBuildMap.clear();
47554755
// finish the last batch
47564756
recordBuildCommands();
47574757
if (!flushRanges.empty())
@@ -4918,9 +4918,7 @@ if (worstSize>minScratchSize)
49184918
}
49194919

49204920

4921-
// Descriptor Sets need their TLAS descriptors substituted if they've been compacted
4922-
core::vector<IGPUDescriptorSet::SWriteDescriptorSet> tlasRewrites; tlasRewrites.reserve(compactedTLASMap.size());
4923-
core::vector<IGPUDescriptorSet::SDescriptorInfo> tlasInfos; tlasInfos.reserve(compactedTLASMap.size());
4921+
// Descriptor Sets need their TLAS descriptors substituted if they've been compacted
49244922
// want to check if deps successfully exist
49254923
auto missingDependent = [&reservations]<Asset AssetType>(const typename asset_traits<AssetType>::video_t* dep)->bool
49264924
{
@@ -4966,7 +4964,6 @@ if (worstSize>minScratchSize)
49664964
if (samplers[i])
49674965
depsMissing = missingDependent.operator()<ICPUSampler>(samplers[i].get());
49684966
}
4969-
const auto tlasRewriteOldSize = tlasRewrites.size();
49704967
for (auto i=0u; !depsMissing && i<static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_COUNT); i++)
49714968
{
49724969
const auto type = static_cast<asset::IDescriptor::E_TYPE>(i);
@@ -4995,27 +4992,21 @@ if (worstSize>minScratchSize)
49954992
case asset::IDescriptor::EC_ACCELERATION_STRUCTURE:
49964993
{
49974994
const auto* tlas = static_cast<const IGPUTopLevelAccelerationStructure*>(untypedDesc);
4998-
depsMissing = missingDependent.operator()<ICPUTopLevelAccelerationStructure>(tlas);
4999-
if (!depsMissing)
5000-
{
5001-
// TODO: Descriptor sets and other things still hold old non-compacted TLASes which balloons our peak memory usage, theoretically can drop them as soon as the compaction submit is done. Maybe defer writing TLAS into acceleration structure until conversions are done?
5002-
auto found = compactedTLASMap.find(tlas);
5003-
if (found==compactedTLASMap.end())
5004-
break;
5005-
// written TLAS got compacted, so queue the descriptor for update
5006-
using redirect_t = IDescriptorSetLayoutBase::CBindingRedirect;
5007-
const redirect_t& redirect = layout->getDescriptorRedirect(IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE);
5008-
const auto bindingRange = redirect.findBindingStorageIndex(redirect_t::storage_offset_t(i));
5009-
const auto firstElementOffset = redirect.getStorageOffset(bindingRange).data;
5010-
tlasRewrites.push_back(IGPUDescriptorSet::SWriteDescriptorSet{
5011-
.dstSet = item.first,
5012-
.binding = redirect.getBinding(bindingRange).data,
5013-
.arrayElement = i-firstElementOffset,
5014-
.count = 1, // write them one by one, no point optimizing
5015-
.info = nullptr // for now, will set once the vector of infos stops growing
5016-
});
5017-
tlasInfos.emplace_back().desc = smart_refctd_ptr<IGPUTopLevelAccelerationStructure>(found->second);
5018-
}
4995+
// successfully written a TLAS into the binding, nothing to check
4996+
if (tlas)
4997+
break;
4998+
// we have a null TLAS in the binding, and we have to check if we were supposed to have one in it
4999+
using redirect_t = IDescriptorSetLayoutBase::CBindingRedirect;
5000+
const redirect_t& redirect = layout->getDescriptorRedirect(IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE);
5001+
const auto bindingRange = redirect.findBindingStorageIndex(redirect_t::storage_offset_t(i));
5002+
const auto firstElementOffset = redirect.getStorageOffset(bindingRange).data;
5003+
const auto foundWrite = reservations.m_deferredTLASDescriptorWrites.find({
5004+
.dstSet = item.first,
5005+
.binding = redirect.getBinding(bindingRange).data,
5006+
.arrayElement = i-firstElementOffset
5007+
});
5008+
// was scheduled to write some TLAS to this binding, but TLAS is now null
5009+
depsMissing = foundWrite!=reservations.m_deferredTLASDescriptorWrites.end() && !foundWrite->second;
50195010
break;
50205011
}
50215012
default:
@@ -5025,12 +5016,6 @@ if (worstSize>minScratchSize)
50255016
}
50265017
}
50275018
}
5028-
// don't bother overwriting a Descriptor Set that won't be marked as successfully converted (inserted into write cache)
5029-
if (depsMissing)
5030-
{
5031-
tlasRewrites.resize(tlasRewriteOldSize);
5032-
tlasInfos.resize(tlasRewriteOldSize);
5033-
}
50345019
}
50355020
auto* pGpuObj = item.first;
50365021
if (depsMissing)
@@ -5040,7 +5025,6 @@ if (worstSize>minScratchSize)
50405025
item.second.value = {};
50415026
continue;
50425027
}
5043-
// TODO: we could just hotswap the `pGpuObj` in staging and write it to Descriptor Set here instead
50445028
// The BLASes don't need to do this, because no-one checks for them as dependents and we can substitute the `item.first` in the staging cache right away
50455029
// For TLASes we need to write the compacted TLAS and not the intermediate build to the Cache
50465030
if constexpr (IsTLAS)
@@ -5074,23 +5058,43 @@ if (worstSize>minScratchSize)
50745058
mergeCache.operator()<ICPUComputePipeline>();
50755059
mergeCache.operator()<ICPURenderpass>();
50765060
mergeCache.operator()<ICPUGraphicsPipeline>();
5077-
mergeCache.operator()<ICPUDescriptorSet>();
5078-
// TODO: should be done during `mergeCache.operator()<ICPUDescriptorSet>`
5079-
// deal with rewriting the TLASes with compacted ones
5061+
// write the TLASes into Descriptor Set finally
5062+
if (auto& tlasWriteMap=reservations.m_deferredTLASDescriptorWrites; !tlasWriteMap.empty())
50805063
{
5081-
// not strictly necessary, just provoking refcounting bugs right away if they exist
5082-
compactedTLASMap.clear();
5083-
auto* infoIt = tlasInfos.data();
5084-
// writes map 1:1 with infos, the lazy way, can finally write the pointer as vector stops growing
5085-
for (auto& write : tlasRewrites)
5086-
write.info = infoIt++;
5087-
if (!tlasRewrites.empty())
5064+
core::vector<IGPUDescriptorSet::SWriteDescriptorSet> writes;
5065+
writes.reserve(tlasWriteMap.size());
5066+
core::vector<IGPUDescriptorSet::SDescriptorInfo> infos(writes.size());
5067+
auto* pInfo = infos.data();
5068+
for (auto& inWrite : tlasWriteMap)
50885069
{
5089-
const bool success = device->updateDescriptorSets(tlasRewrites,{});
5090-
// There's no point in any fault handling, everything we have done should have been valid
5091-
assert(success);
5070+
auto& tlas = inWrite.second;
5071+
assert(tlas);
5072+
if (missingDependent.operator()<ICPUTopLevelAccelerationStructure>(tlas.get()))
5073+
{
5074+
tlas = nullptr;
5075+
continue;
5076+
}
5077+
if (const auto foundCompacted=compactedTLASMap.find(tlas.get()); foundCompacted!=compactedTLASMap.end())
5078+
tlas = foundCompacted->second;
5079+
pInfo->desc = tlas;
5080+
writes.push_back({
5081+
.dstSet = inWrite.first.dstSet,
5082+
.binding = inWrite.first.binding,
5083+
.arrayElement = inWrite.first.arrayElement,
5084+
.count = 1,
5085+
.info = pInfo++
5086+
});
50925087
}
5088+
// not strictly necessary, just provoking refcounting bugs right away if they exist
5089+
compactedTLASMap.clear();
5090+
// if the descriptor write fails, we make the Descriptor Sets behave as-if the TLAS build failed (dep is missing)
5091+
if (!writes.empty() && !device->updateDescriptorSets(writes,{}))
5092+
for (auto& inWrite : tlasWriteMap)
5093+
inWrite.second = nullptr;
50935094
}
5095+
mergeCache.operator()<ICPUDescriptorSet>();
5096+
// needed for the IGPUDescriptorSets to check if TLAS exists/was written, can be released now
5097+
reservations.m_deferredTLASDescriptorWrites.clear();
50945098
// mergeCache.operator()<ICPUFramebuffer>();
50955099

50965100
// no submit was necessary, so should signal the extra semaphores from the host

0 commit comments

Comments
 (0)