Skip to content

Commit 8f43fef

Browse files
author
devsh
committed
Figure out the TLAS/BLAS compaction logic and swap in cache.
Also update comments about what ends up in `m_gpuObjects`
1 parent 6675224 commit 8f43fef

File tree

3 files changed

+105
-38
lines changed

3 files changed

+105
-38
lines changed

include/nbl/asset/IDescriptorSetLayout.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,16 @@ class IDescriptorSetLayoutBase : public virtual core::IReferenceCounted // TODO:
147147
return getStorageOffset(index);
148148
}
149149

150+
// Weird functions for exceptional situations
151+
// Maps a flat storage offset back to the index of the binding range that contains it.
// Returns a default-constructed `storage_range_index_t` when `offset` is not below any stored entry.
// NOTE(review): relies on `m_storageOffsets` being sorted ascending, with every entry marking the END of its
// range (`getTotalCount()` reads the last entry as the grand total) - confirm against the code filling the array.
inline storage_range_index_t findBindingStorageIndex(const storage_offset_t offset) const
{
	const auto end = m_storageOffsets+m_count;
	// generic comparator: `upper_bound` invokes it as (searched value, array element), both only need `.data`
	const auto found = std::upper_bound(m_storageOffsets,end,offset,[](const auto& lhs, const auto& rhs)->bool{return lhs.data<rhs.data;});
	// nothing is greater than `offset`, so it lies past the last range
	if (found==end)
		return {};
	// distance from the start of the array is the range index (the operands were previously swapped, yielding a non-positive value)
	return storage_range_index_t(found-m_storageOffsets);
}
159+
150160
// Total as recorded in the last entry of `m_storageOffsets`, or zero when there are no entries at all.
inline uint32_t getTotalCount() const
{
	if (m_count == 0ull)
		return 0u;
	return m_storageOffsets[m_count - 1].data;
}
151161

152162
private:

include/nbl/video/utilities/CAssetConverter.h

Lines changed: 26 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -965,7 +965,8 @@ class CAssetConverter : public core::IReferenceCounted
965965
//
966966
// Truthiness of the result simply mirrors the truthiness of `m_converter`.
inline operator bool() const
{
	return static_cast<bool>(m_converter);
}
967967

968-
// until `convert` is called, this will only contain valid entries for items already found in `SInput::readCache`
968+
// Until `convert` is called, the Buffers and Images are not filled with content and Acceleration Structures are not built, unless found in the `SInput::readCache`
969+
// WARNING: The Acceleration Structure Pointer WILL CHANGE after calling `convert` if its patch dictates that it will be compacted! (since AS can't resize)
969970
// TODO: we could also return per-object semaphore values when object is ready for use (would have to propagate two semaphores up through dependants)
970971
template<asset::Asset AssetType>
971972
std::span<const asset_cached_t<AssetType>> getGPUObjects() const {return std::get<vector_t<AssetType>>(m_gpuObjects);}
@@ -1020,34 +1021,36 @@ class CAssetConverter : public core::IReferenceCounted
10201021
core::tuple_transform_t<staging_cache_t,supported_asset_types> m_stagingCaches;
10211022
// need a more explicit list of GPU objects that need device-assisted conversion
10221023
template<asset::Asset AssetType>
1023-
struct ConversionRequest
1024+
struct SConversionRequestBase
10241025
{
10251026
// canonical asset (the one that provides content)
10261027
core::smart_refctd_ptr<const AssetType> canonical;
10271028
// gpu object to transfer canonical's data to or build it from
10281029
asset_traits<AssetType>::video_t* gpuObj;
1029-
union
1030-
{
1031-
// only relevant for images
1032-
uint16_t recomputeMips = 0;
1033-
//
1034-
struct ASBuildParams
1035-
{
1036-
// TODO: buildFlags
1037-
uint8_t host : 1;
1038-
uint8_t compact : 1;
1039-
} asBuildParams;
1040-
};
10411030
};
1042-
template<asset::Asset AssetType>
1043-
using conversion_requests_t = core::vector<ConversionRequest<AssetType>>;
1044-
using convertible_asset_types = core::type_list<
1045-
asset::ICPUBuffer,
1046-
asset::ICPUImage,
1047-
asset::ICPUBottomLevelAccelerationStructure,
1048-
asset::ICPUTopLevelAccelerationStructure
1049-
>;
1050-
core::tuple_transform_t<conversion_requests_t,convertible_asset_types> m_conversionRequests;
1031+
using SConvReqBuffer = SConversionRequestBase<asset::ICPUBuffer>;
1032+
core::vector<SConvReqBuffer> m_bufferConversions;
1033+
// Conversion request for an image: the canonical asset / GPU object pair from the base, plus mip regeneration info.
struct SConvReqImage : SConversionRequestBase<asset::ICPUImage>
{
	// Kept 16 bits wide because `reserve` forwards the patch's `recomputeMips` as a `uint16_t`;
	// storing it in a `bool` would collapse every non-zero value to 1.
	// NOTE(review): presumably a per-mip-level mask - confirm against the image patch definition.
	uint16_t recomputeMips = 0;
};
1037+
core::vector<SConvReqImage> m_imageConversions;
1038+
template<typename CPUAccelerationStructure>// requires std::is_base_of_v<asset::ICPUAccelerationStructure,CPUAccelerationStructure>
1039+
struct SConvReqAccelerationStructure : SConversionRequestBase<CPUAccelerationStructure>
1040+
{
1041+
constexpr static inline uint64_t WontCompact = (0x1ull<<48)-1;
1042+
inline bool compact() const {return compactedASWriteOffset!=WontCompact;}
1043+
1044+
using build_f = typename CPUAccelerationStructure::BUILD_FLAGS;
1045+
inline void setBuildFlags(const build_f _flags) {buildFlags = static_cast<uint16_t>(_flags);}
1046+
inline build_f getBuildFlags() const {return static_cast<build_f>(buildFlags);}
1047+
1048+
1049+
uint64_t compactedASWriteOffset : 48 = WontCompact;
1050+
uint64_t buildFlags : 16 = static_cast<uint16_t>(build_f::NONE);
1051+
};
1052+
core::vector<SConvReqAccelerationStructure<asset::ICPUBottomLevelAccelerationStructure>> m_blasConversions[2];
1053+
core::vector<SConvReqAccelerationStructure<asset::ICPUTopLevelAccelerationStructure>> m_tlasConversions[2];
10511054

10521055
//
10531056
uint64_t m_minASBuildScratchSize[2] = {0,0};

src/nbl/video/utilities/CAssetConverter.cpp

Lines changed: 69 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3016,14 +3016,13 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult
30163016
return;
30173017
}
30183018
}
3019-
// this is super annoying, was hoping metaprogramming with `has_type` would actually work
3020-
auto getConversionRequests = [&]<typename AssetU>()->auto&{return std::get<SReserveResult::conversion_requests_t<AssetU>>(retval.m_conversionRequests);};
3019+
//
30213020
if constexpr (std::is_same_v<AssetType,ICPUBuffer>)
3022-
getConversionRequests.operator()<ICPUBuffer>().emplace_back(core::smart_refctd_ptr<const AssetType>(instance.asset),created.gpuObj.get());;
3021+
retval.m_bufferConversions.emplace_back({core::smart_refctd_ptr<const AssetType>(instance.asset),created.gpuObj.get()});
30233022
if constexpr (std::is_same_v<AssetType,ICPUImage>)
30243023
{
30253024
const uint16_t recomputeMips = created.patch.recomputeMips;
3026-
getConversionRequests.operator()<ICPUImage>().emplace_back(core::smart_refctd_ptr<const AssetType>(instance.asset),created.gpuObj.get(),recomputeMips);
3025+
retval.m_imageConversions.emplace_back({core::smart_refctd_ptr<const AssetType>(instance.asset),created.gpuObj.get()},recomputeMips);
30273026
}
30283027
// TODO: BLAS and TLAS requests
30293028
}
@@ -3337,7 +3336,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult
33373336
}
33383337
const auto& found = dfsCache.nodes[metadata[i].patchIndex.value];
33393338
// write it out to the results
3340-
if (const auto& gpuObj=found.gpuObj; gpuObj) // found from the `input.readCache`
3339+
if (const auto& gpuObj=found.gpuObj; gpuObj)
33413340
{
33423341
results[i] = gpuObj;
33433342
// if something with this content hash is in the stagingCache, then it must match the `found->gpuObj`
@@ -3372,6 +3371,8 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
33723371
auto device = m_params.device;
33733372
const auto reqQueueFlags = reservations.getRequiredQueueFlags();
33743373

3374+
// compacted TLASes need to be substituted in cache and Descriptor Sets
3375+
core::unordered_map<IGPUTopLevelAccelerationStructure*,smart_refctd_ptr<IGPUTopLevelAccelerationStructure>> compactedTLASMap;
33753376
// Anything to do?
33763377
if (reqQueueFlags.value!=IQueue::FAMILY_FLAGS::NONE)
33773378
{
@@ -3536,7 +3537,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
35363537
};
35373538

35383539
// upload Buffers
3539-
auto& buffersToUpload = std::get<SReserveResult::conversion_requests_t<ICPUBuffer>>(reservations.m_conversionRequests);
3540+
auto& buffersToUpload = reservations.m_bufferConversions;
35403541
{
35413542
core::vector<IGPUCommandBuffer::SBufferMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier>> ownershipTransfers;
35423543
ownershipTransfers.reserve(buffersToUpload.size());
@@ -3630,7 +3631,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
36303631
origXferStallCallback(tillScratchResettable);
36313632
};
36323633

3633-
auto& imagesToUpload = std::get<SReserveResult::conversion_requests_t<ICPUImage>>(reservations.m_conversionRequests);
3634+
auto& imagesToUpload = reservations.m_imageConversions;
36343635
if (!imagesToUpload.empty())
36353636
{
36363637
//
@@ -4088,7 +4089,8 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
40884089
}
40894090

40904091
// BLAS builds
4091-
auto& blasToBuild = std::get<SReserveResult::conversion_requests_t<ICPUBottomLevelAccelerationStructure>>(reservations.m_conversionRequests);
4092+
core::unordered_map<IGPUBottomLevelAccelerationStructure*,smart_refctd_ptr<IGPUBottomLevelAccelerationStructure>> compactedBLASMap;
4093+
auto& blasToBuild = reservations.m_blasConversions[0];
40924094
if (const auto blasCount = blasToBuild.size(); blasCount)
40934095
{
40944096
constexpr auto GeometryIsAABBFlag = ICPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT;
@@ -4111,6 +4113,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
41114113
triangles.reserve(totalTriGeoCount);
41124114
triangles.reserve(totalAABBGeoCount);
41134115
}
4116+
#if 0
41144117
for (auto& item : blasToBuild)
41154118
{
41164119
auto* as = item.gpuObj;
@@ -4141,13 +4144,15 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
41414144
computeCmdBuf->cmdbuf->buildAccelerationStructures(buildInfo,rangeInfo);
41424145
}
41434146
}
4147+
#endif
41444148
}
41454149

41464150
// TLAS builds
4147-
auto& tlasToBuild = std::get<SReserveResult::conversion_requests_t<ICPUTopLevelAccelerationStructure>>(reservations.m_conversionRequests);
4151+
auto& tlasToBuild = reservations.m_tlasConversions[0];
41484152
if (!tlasToBuild.empty())
41494153
{
41504154
}
4155+
compactedBLASMap.clear();
41514156

41524157
const bool computeSubmitIsNeeded = submitsNeeded.hasFlags(IQueue::FAMILY_FLAGS::COMPUTE_BIT);
41534158
// first submit transfer
@@ -4183,6 +4188,9 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
41834188
}
41844189

41854190

4191+
// Descriptor Sets need their TLAS descriptors substituted if they've been compacted
4192+
core::vector<IGPUDescriptorSet::SWriteDescriptorSet> tlasRewrites; tlasRewrites.reserve(compactedTLASMap.size());
4193+
core::vector<IGPUDescriptorSet::SDescriptorInfo> tlasInfos; tlasInfos.reserve(compactedTLASMap.size());
41864194
// want to check if deps successfully exist
41874195
auto missingDependent = [&reservations]<Asset AssetType>(const typename asset_traits<AssetType>::video_t* dep)->bool
41884196
{
@@ -4200,13 +4208,14 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
42004208
auto& cache = std::get<CCache<AssetType>>(m_caches);
42014209
cache.m_forwardMap.reserve(cache.m_forwardMap.size()+stagingCache.size());
42024210
cache.m_reverseMap.reserve(cache.m_reverseMap.size()+stagingCache.size());
4211+
constexpr bool IsTLAS = std::is_same_v<AssetType,ICPUTopLevelAccelerationStructure>;
42034212
for (auto& item : stagingCache)
42044213
if (item.second.value!=CHashCache::NoContentHash) // didn't get wiped
42054214
{
42064215
// rescan all the GPU objects and find out if they depend on anything that failed, if so add to failure set
42074216
bool depsMissing = false;
42084217
// only go over types we could actually break via missing upload/build (i.e. pipelines are unbreakable)
4209-
if constexpr (std::is_same_v<AssetType,ICPUTopLevelAccelerationStructure>)
4218+
if constexpr (IsTLAS)
42104219
{
42114220
// there's no lifetime tracking (refcounting) from TLAS to BLAS, so one just must trust the pre-TLAS-build input validation to do its job
42124221
}
@@ -4225,6 +4234,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
42254234
if (samplers[i])
42264235
depsMissing = missingDependent.operator()<ICPUSampler>(samplers[i].get());
42274236
}
4237+
const auto tlasRewriteOldSize = tlasRewrites.size();
42284238
for (auto i=0u; !depsMissing && i<static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_COUNT); i++)
42294239
{
42304240
const auto type = static_cast<asset::IDescriptor::E_TYPE>(i);
@@ -4251,30 +4261,65 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
42514261
depsMissing = missingDependent.operator()<ICPUBufferView>(static_cast<const IGPUBufferView*>(untypedDesc));
42524262
break;
42534263
case asset::IDescriptor::EC_ACCELERATION_STRUCTURE:
4254-
depsMissing = missingDependent.operator()<ICPUTopLevelAccelerationStructure>(static_cast<const ICPUTopLevelAccelerationStructure*>(untypedDesc));
4264+
{
4265+
const auto* tlas = static_cast<const IGPUTopLevelAccelerationStructure*>(untypedDesc);
4266+
depsMissing = missingDependent.operator()<ICPUTopLevelAccelerationStructure>(tlas);
4267+
if (!depsMissing)
4268+
{
4269+
auto found = compactedTLASMap.find(tlas);
4270+
if (found==compactedTLASMap.end())
4271+
break;
4272+
// written TLAS got compacted, so queue the descriptor for update
4273+
using redirect_t = IDescriptorSetLayoutBase::CBindingRedirect;
4274+
const redirect_t& redirect = layout->getDescriptorRedirect(IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE);
4275+
const auto bindingRange = redirect.findBindingStorageIndex(redirect_t::storage_offset_t(i));
4276+
const auto firstElementOffset = redirect.getStorageOffset(bindingRange).data;
4277+
tlasRewrites.push_back({
4278+
.set = item.first,
4279+
.binding = redirect.getBinding(bindingRange).data,
4280+
.arrayElement = i-firstElementOffset,
4281+
.count = 1, // write them one by one, no point optimizing
4282+
.info = nullptr // for now
4283+
});
4284+
tlasInfos.emplace_back().desc = smart_refctd_ptr<IGPUTopLevelAccelerationStructure>(found->second);
4285+
}
42554286
break;
4287+
}
42564288
default:
42574289
assert(false);
42584290
depsMissing = true;
42594291
break;
42604292
}
42614293
}
42624294
}
4295+
// don't bother overwriting a Descriptor Set that won't be marked as successfully converted (inserted into write cache)
4296+
if (depsMissing)
4297+
{
4298+
tlasRewrites.resize(tlasRewriteOldSize);
4299+
tlasInfos.resize(tlasRewriteOldSize);
4300+
}
42634301
}
4302+
auto* pGpuObj = item.first;
42644303
if (depsMissing)
42654304
{
42664305
const auto* hashAsU64 = reinterpret_cast<const uint64_t*>(item.second.value.data);
4267-
logger.log("GPU Obj %s not writing to final cache because conversion of a dependant failed!", system::ILogger::ELL_ERROR, item.first->getObjectDebugName());
4306+
logger.log("GPU Obj %s not writing to final cache because conversion of a dependant failed!", system::ILogger::ELL_ERROR, pGpuObj->getObjectDebugName());
42684307
// wipe self, to let users know
42694308
item.second.value = {};
42704309
continue;
42714310
}
4272-
if (!params.writeCache(item.second))
4311+
if (!params.writeCache(item.second)) // TODO: let the user know the pointer too?
42734312
continue;
4313+
if constexpr (IsTLAS)
4314+
{
4315+
auto found = compactedTLASMap.find(pGpuObj);
4316+
if (found!=compactedTLASMap.end())
4317+
pGpuObj = found->second.get();
4318+
}
42744319
asset_cached_t<AssetType> cached;
4275-
cached.value = core::smart_refctd_ptr<typename asset_traits<AssetType>::video_t>(item.first);
4320+
cached.value = core::smart_refctd_ptr<typename asset_traits<AssetType>::video_t>(pGpuObj);
4321+
cache.m_reverseMap.emplace(pGpuObj,item.second);
42764322
cache.m_forwardMap.emplace(item.second,std::move(cached));
4277-
cache.m_reverseMap.emplace(item.first,item.second);
42784323
}
42794324
};
42804325
// again, need to go bottom up so we can check dependencies being successes
@@ -4293,6 +4338,15 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
42934338
mergeCache.operator()<ICPURenderpass>();
42944339
mergeCache.operator()<ICPUGraphicsPipeline>();
42954340
mergeCache.operator()<ICPUDescriptorSet>();
4341+
// deal with rewriting the TLASes with compacted ones
4342+
{
4343+
compactedTLASMap.clear();
4344+
auto* infoIt = tlasInfos.data();
4345+
for (auto& write : tlasRewrites)
4346+
write.info = infoIt++;
4347+
if (!tlasRewrites.empty())
4348+
device->updateDescriptorSets(tlasRewrites,{});
4349+
}
42964350
// mergeCache.operator()<ICPUFramebuffer>();
42974351

42984352
// no submit was necessary, so should signal the extra semaphores from the host

0 commit comments

Comments
 (0)