Commit 0f2f626 (merge of parents bb84ad9 + 55703e5)

Asset Converter cleanup, Acceleration Structure reserve implemented

4 files changed: +1007 −964 lines

include/nbl/video/IGPUAccelerationStructure.h

Lines changed: 5 additions & 0 deletions
```diff
@@ -638,6 +638,9 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr
 		// I don't do an actual union because the preceding members don't play nicely with alignment of `core::matrix3x4SIMD` and Vulkan requires this struct to be packed
 		SRTMotionInstance<blas_ref_t> largestUnionMember = {};
 		static_assert(alignof(SRTMotionInstance<blas_ref_t>)==8ull);
+
+	public:
+		constexpr static inline size_t LargestUnionMemberSize = sizeof(largestUnionMember);
 };
 using DevicePolymorphicInstance = PolymorphicInstance<IGPUBottomLevelAccelerationStructure::device_op_ref_t>;
 using HostPolymorphicInstance = PolymorphicInstance<IGPUBottomLevelAccelerationStructure::host_op_ref_t>;
```
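The `PolymorphicInstance` wrapper above emulates a `union`: a real union would inherit the strictest member alignment and break the packed layout Vulkan requires, so only the largest alternative is stored and the newly exported `LargestUnionMemberSize` lets callers size per-instance staging memory. A minimal self-contained sketch of the pattern, with hypothetical stand-in instance types (the real ones live in Nabla):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

// hypothetical stand-ins for the real instance structs
struct StaticInstance    { uint64_t blasRef; uint32_t instanceCustomIndex; float transform[12]; };
struct SRTMotionInstance { uint64_t blasRef; uint32_t instanceCustomIndex; float srt[2][16]; };

struct PolymorphicInstanceSketch
{
	static_assert(sizeof(SRTMotionInstance)>=sizeof(StaticInstance));

	uint32_t type = 0; // discriminant telling us which alternative is live
	// storage for the largest alternative; smaller ones are overlaid into it,
	// avoiding a real union whose alignment requirements would un-pack the struct
	SRTMotionInstance largestUnionMember = {};
	constexpr static inline size_t LargestUnionMemberSize = sizeof(largestUnionMember);

	void setStatic(const StaticInstance& s)
	{
		type = 0;
		std::memcpy(&largestUnionMember,&s,sizeof(s)); // overlay the smaller member
	}
};

// a staging buffer can now be sized for the worst case per instance
constexpr size_t stagingBytesFor(const size_t instanceCount)
{
	return instanceCount*PolymorphicInstanceSketch::LargestUnionMemberSize;
}
```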
```diff
@@ -664,6 +667,8 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr
 
 		//
 		using build_ver_t = uint32_t;
+		//
+		inline build_ver_t getPendingBuildVer() const {return m_pendingBuildVer;}
 		// this gets called when execution is sure to happen 100%, e.g. not during command recording but during submission
 		inline build_ver_t registerNextBuildVer()
 		{
```
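`registerNextBuildVer` is documented to run only once execution is certain, i.e. at submission rather than during command recording, so the pending version only ever counts builds that will really happen. A plausible minimal shape of the counter (the member is private and not shown in this hunk, so the atomic and its initial value are assumptions):

```cpp
#include <atomic>
#include <cstdint>

// hypothetical stand-in for the version-tracking part of the TLAS class
class BuildVersionTracker
{
	public:
		using build_ver_t = uint32_t;

		// what a recorded-but-not-yet-submitted build would compare against
		inline build_ver_t getPendingBuildVer() const {return m_pendingBuildVer.load();}

		// only called when execution is sure to happen, i.e. during submission,
		// so the counter never counts builds that might still be abandoned
		inline build_ver_t registerNextBuildVer() {return ++m_pendingBuildVer;}

	private:
		std::atomic<build_ver_t> m_pendingBuildVer = 0;
};
```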

include/nbl/video/utilities/CAssetConverter.h

Lines changed: 61 additions & 58 deletions
```diff
@@ -900,6 +900,9 @@ class CAssetConverter : public core::IReferenceCounted
 			IGPUPipelineCache* pipelineCache = nullptr;
 			// optional, defaults to the device
 			IDeviceMemoryAllocator* allocator = nullptr;
+			// optional, defaults to worst case (Apple Silicon page size)
+			uint32_t scratchForDeviceASBuildMinAllocSize = 1<<14;
+			uint32_t scratchForHostASBuildMinAllocSize = 1<<14;
 		};
 		// Split off from inputs because only assets that build on IPreHashed need uploading
 		struct SConvertParams
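```

The new defaults set the suballocation granularity for acceleration-structure build scratch; `1<<14` is 16 KiB, the Apple Silicon page size, i.e. the worst-case minimum allocation among supported platforms. A hypothetical caller on a 4 KiB-page desktop target might shrink it (sketch only; the surrounding struct and its use are simplified stand-ins):

```cpp
#include <cstdint>

// simplified stand-in for the converter input struct carrying the new fields
struct ASBuildScratchParamsSketch
{
	// optional, defaults to worst case (Apple Silicon page size)
	uint32_t scratchForDeviceASBuildMinAllocSize = 1u<<14; // 16 KiB
	uint32_t scratchForHostASBuildMinAllocSize = 1u<<14;
};

int main()
{
	ASBuildScratchParamsSketch params;
	// assumption: on a platform with 4 KiB pages a finer granularity could
	// reduce scratch fragmentation; the converter itself does not mandate this
	params.scratchForDeviceASBuildMinAllocSize = 1u<<12;
	return 0;
}
```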
```diff
@@ -970,7 +973,14 @@ class CAssetConverter : public core::IReferenceCounted
 
 	public:
 		template<asset::Asset AssetType>
-		using staging_cache_t = core::unordered_map<typename asset_traits<AssetType>::video_t*,typename CCache<AssetType>::key_t>;
+		struct staging_cache_key
+		{
+			core::smart_refctd_ptr<typename asset_traits<AssetType>::video_t> gpuRef;
+			typename CCache<AssetType>::key_t cacheKey;
+		};
+		// it may seem weird storing both a smart pointer and a raw pointer, but the reason is to be able to drop a refcount while not losing the key for lookup
+		template<asset::Asset AssetType>
+		using staging_cache_t = core::unordered_map<const typename asset_traits<AssetType>::video_t*,staging_cache_key<AssetType>>;
 
 		inline SReserveResult(SReserveResult&&) = default;
 		inline SReserveResult(const SReserveResult&) = delete;
```
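The comment above is the crux of the new `staging_cache_t` shape: keying by raw pointer while the value owns the smart pointer means ownership can be moved out of an entry without invalidating the key used to find and erase it. A minimal sketch with standard-library stand-ins (`std::shared_ptr` in place of `core::smart_refctd_ptr`, a plain integer for the cache key):

```cpp
#include <cstdint>
#include <memory>
#include <unordered_map>

struct GpuObject { uint64_t handle; }; // stand-in for the video_t type
struct StagingValue
{
	std::shared_ptr<GpuObject> gpuRef; // keeps the object alive while staged
	uint64_t cacheKey;                 // stand-in for CCache::key_t
};

int main()
{
	std::unordered_map<const GpuObject*,StagingValue> staging;

	auto obj = std::make_shared<GpuObject>(GpuObject{42});
	const GpuObject* rawKey = obj.get();
	staging[rawKey] = {std::move(obj),0xdeadbeefull};

	// on successful conversion, move ownership out to the final write-cache...
	auto promoted = std::move(staging[rawKey].gpuRef);
	// ...the refcount held by the map is gone, yet the raw-pointer key is
	// still valid for erasing the staging entry
	staging.erase(rawKey);
	return 0;
}
```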
```diff
@@ -1000,7 +1010,12 @@ class CAssetConverter : public core::IReferenceCounted
 			assert(m_minASBuildScratchSize[forHostOps]<=m_maxASBuildScratchSize[forHostOps]);
 			return m_maxASBuildScratchSize[forHostOps];
 		}
-		// TODO: `getMinCompactedASAllocatorSpace`
+		// We do all compactions on the Device for simplicity
+		inline uint64_t getMinCompactedASAllocatorSpace() const
+		{
+			assert(m_compactedASMaxMemory == 0 || willDeviceASBuild() || willHostASBuild());
+			return m_compactedASMaxMemory;
+		}
 		// tells you if you need to provide a valid `SConvertParams::scratchForDeviceASBuild`
 		inline bool willDeviceASBuild() const {return getMinASBuildScratchSize(false)>0;}
 		// tells you if you need to provide a valid `SConvertParams::scratchForHostASBuild`
```
```diff
@@ -1013,8 +1028,7 @@ class CAssetConverter : public core::IReferenceCounted
 		// tells you if you need to provide a valid `SConvertParams::compactedASAllocator`
 		inline bool willCompactAS() const
 		{
-			assert(!m_willCompactSomeAS || willDeviceASBuild() || willHostASBuild());
-			return m_willCompactSomeAS;
+			return getMinCompactedASAllocatorSpace()!=0;
 		}
 
 		//
```
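Taken together, these two hunks retire the single `m_willCompactSomeAS` bit in favor of `m_compactedASMaxMemory`, a byte count, so `willCompactAS()` is now derived from `getMinCompactedASAllocatorSpace()` and the two answers can never disagree. A self-contained mock of the resulting query surface (simplified; the real class is `CAssetConverter::SReserveResult`):

```cpp
#include <cassert>
#include <cstdint>

class ReserveResultMock
{
	public:
		inline uint64_t getMinASBuildScratchSize(const bool forHostOps) const
		{
			return m_minASBuildScratchSize[forHostOps];
		}
		inline bool willDeviceASBuild() const {return getMinASBuildScratchSize(false)>0;}
		inline bool willHostASBuild() const {return getMinASBuildScratchSize(true)>0;}
		// compaction memory can only be needed if something gets built at all
		inline uint64_t getMinCompactedASAllocatorSpace() const
		{
			assert(m_compactedASMaxMemory==0 || willDeviceASBuild() || willHostASBuild());
			return m_compactedASMaxMemory;
		}
		// derived, so it cannot drift out of sync with the size query
		inline bool willCompactAS() const {return getMinCompactedASAllocatorSpace()!=0;}

	private:
		// array index 0 for device builds, 1 for host builds
		uint64_t m_minASBuildScratchSize[2] = {0,0};
		uint64_t m_compactedASMaxMemory = 0;
};
```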
```diff
@@ -1057,21 +1071,10 @@ class CAssetConverter : public core::IReferenceCounted
 			return enqueueSuccess;
 		}
 
-		// public only because `GetDependantVisit<ICPUDescriptorSet>` needs it
-		struct SDeferredTLASWrite
-		{
-			inline bool operator==(const SDeferredTLASWrite& other) const
-			{
-				return dstSet == other.dstSet && binding == other.binding && arrayElement == other.arrayElement;
-			}
-
-			IGPUDescriptorSet* dstSet;
-			uint32_t binding;
-			uint32_t arrayElement;
-			core::smart_refctd_ptr<IGPUTopLevelAccelerationStructure> tlas;
-		};
 	private:
 		friend class CAssetConverter;
+		// internal classes
+		template<asset::Asset AssetType> friend class GetDependantVisit;
 
 		inline SReserveResult() = default;
 
```
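`SDeferredTLASWrite` moves out of the public interface, and access for the visitor machinery is instead granted through a friend declaration of a class template, which befriends every `GetDependantVisit<T>` specialization at once. A minimal illustration of the idiom (all types here are stand-ins):

```cpp
#include <iostream>

template<typename AssetType> class GetDependantVisit; // forward declaration

class SReserveResultSketch
{
		// one declaration befriends GetDependantVisit<T> for every T
		template<typename AssetType> friend class GetDependantVisit;

		int m_internalState = 42;
};

template<typename AssetType>
class GetDependantVisit
{
	public:
		// allowed despite m_internalState being private, thanks to friendship
		static int peek(const SReserveResultSketch& r) {return r.m_internalState;}
};

int main()
{
	SReserveResultSketch r;
	std::cout << GetDependantVisit<float>::peek(r) << '\n'; // prints 42
	return 0;
}
```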
```diff
@@ -1087,70 +1090,70 @@ class CAssetConverter : public core::IReferenceCounted
 
 		// we don't insert into the writeCache until conversions are successful
 		core::tuple_transform_t<staging_cache_t,supported_asset_types> m_stagingCaches;
+
 		// need a more explicit list of GPU objects that need device-assisted conversion
-		template<asset::Asset AssetType>
-		struct SConversionRequestBase
-		{
-			// canonical asset (the one that provides content)
-			core::smart_refctd_ptr<const AssetType> canonical;
-			// gpu object to transfer canonical's data to or build it from
-			asset_traits<AssetType>::video_t* gpuObj;
-		};
-		using SConvReqBuffer = SConversionRequestBase<asset::ICPUBuffer>;
-		core::vector<SConvReqBuffer> m_bufferConversions;
-		struct SConvReqImage : SConversionRequestBase<asset::ICPUImage>
+		core::unordered_map<IGPUBuffer*,core::smart_refctd_ptr<const asset::ICPUBuffer>> m_bufferConversions;
+		struct SConvReqImage
 		{
+			core::smart_refctd_ptr<const asset::ICPUImage> canonical = nullptr;
 			uint16_t recomputeMips = 0;
 		};
-		core::vector<SConvReqImage> m_imageConversions;
+		core::unordered_map<IGPUImage*,SConvReqImage> m_imageConversions;
 		template<typename CPUAccelerationStructure>
-		struct SConvReqAccelerationStructure : SConversionRequestBase<CPUAccelerationStructure>
+		struct SConvReqAccelerationStructure
 		{
-			constexpr static inline uint64_t WontCompact = (0x1ull<<48)-1;
-			inline bool compact() const {return compactedASWriteOffset!=WontCompact;}
-
 			using build_f = typename asset_traits<CPUAccelerationStructure>::video_t::BUILD_FLAGS;
 			inline void setBuildFlags(const build_f _flags) {buildFlags = static_cast<uint16_t>(_flags);}
 			inline build_f getBuildFlags() const {return static_cast<build_f>(buildFlags);}
 
-
-			uint64_t scratchSize;
-			uint64_t compactedASWriteOffset : 48 = WontCompact;
-			uint64_t buildFlags : 16 = static_cast<uint16_t>(build_f::NONE);
+			core::smart_refctd_ptr<const CPUAccelerationStructure> canonical = nullptr;
+			uint64_t scratchSize : 47 = 0;
+			uint64_t buildFlags : 16 = 0;
+			uint64_t compact : 1;
+			// scratch + input size also accounting for worst case padding due to alignment
+			uint64_t buildSize;
+		};
+		using SConvReqBLASMap = core::unordered_map<IGPUBottomLevelAccelerationStructure*,SConvReqAccelerationStructure<asset::ICPUBottomLevelAccelerationStructure>>;
+		SConvReqBLASMap m_blasConversions[2];
+		struct SConvReqTLAS : SConvReqAccelerationStructure<asset::ICPUTopLevelAccelerationStructure>
+		{
+			// This tracks non-root BLASes which are needed for a subsequent TLAS build.
+			// Because the copy group ID of the BLAS can only depend on the copy group and pointer of the TLAS and BLAS,
+			// we can be sure that all instances of the same BLAS within a TLAS will have the same copy group ID and use a map instead of a vector for storage
+			// Note that even things which are NOT in the staging cache are tracked here to make sure they don't finish their lifetimes prematurely.
+			using cpu_to_gpu_blas_map_t = core::unordered_map<const asset::ICPUBottomLevelAccelerationStructure*,core::smart_refctd_ptr<const IGPUBottomLevelAccelerationStructure>>;
+			cpu_to_gpu_blas_map_t instanceMap;
 		};
-		using SConvReqBLAS = SConvReqAccelerationStructure<asset::ICPUBottomLevelAccelerationStructure>;
-		core::vector<SConvReqBLAS> m_blasConversions[2];
-		using SConvReqTLAS = SConvReqAccelerationStructure<asset::ICPUTopLevelAccelerationStructure>;
-		core::vector<SConvReqTLAS> m_tlasConversions[2];
+		using SConvReqTLASMap = core::unordered_map<IGPUTopLevelAccelerationStructure*,SConvReqTLAS>;
+		SConvReqTLASMap m_tlasConversions[2];
 
-		// 0 for device builds, 1 for host builds
+		// array index 0 for device builds, 1 for host builds
 		uint64_t m_minASBuildScratchSize[2] = {0,0};
 		uint64_t m_maxASBuildScratchSize[2] = {0,0};
-		// TODO: make the compaction count the size
-		// We do all compactions on the Device for simplicity
-		uint8_t m_willCompactSomeAS : 1 = false;
-		// This tracks non-root BLASes which are needed for a subsequent TLAS build. Note that even things which are NOT in the staging cache are tracked here to make sure they don't finish their lifetimes early.
-		struct BLASUsedInTLASBuild
+		uint64_t m_compactedASMaxMemory = 0;
+		//
+		struct SDeferredTLASWrite
 		{
-			// This is the BLAS meant to be used for the instance, note that compaction of a BLAS overwrites the initial values at the end of `reserve`
-			core::smart_refctd_ptr<const IGPUBottomLevelAccelerationStructure> gpuBLAS;
-			uint64_t buildDuringConvertCall : 1 = false;
-			// internal micro-refcount which lets us know when we should remove the entry from the map below
-			uint64_t remainingUsages : 63 = 0;
+			inline bool operator==(const SDeferredTLASWrite& other) const
+			{
+				return dstSet==other.dstSet && storageOffset.data==other.storageOffset.data;
+			}
+
+			IGPUDescriptorSet* dstSet;
+			// binding and array element rolled up into one
+			IGPUDescriptorSetLayout::CBindingRedirect::storage_offset_t storageOffset;
 		};
-		using cpu_to_gpu_blas_map_t = core::unordered_map<const asset::ICPUBottomLevelAccelerationStructure*,BLASUsedInTLASBuild>;
-		cpu_to_gpu_blas_map_t m_blasBuildMap;
 		struct SDeferredTLASWriteHasher
 		{
 			inline size_t operator()(const SDeferredTLASWrite& write) const
 			{
-				size_t retval = std::bit_cast<size_t>(write.dstSet);
-				core::hash_combine(retval,write.binding);
-				core::hash_combine(retval,write.arrayElement);
+				size_t retval = write.storageOffset.data;
+				core::hash_combine(retval,write.dstSet);
 				return retval;
 			}
 		};
-		core::unordered_set<SDeferredTLASWrite,SDeferredTLASWriteHasher> m_deferredTLASDescriptorWrites;
+		using compacted_tlas_rewrite_set_t = core::unordered_set<SDeferredTLASWrite,SDeferredTLASWriteHasher>;
+		compacted_tlas_rewrite_set_t m_potentialTLASRewrites;
 
 		//
 		core::bitflag<IQueue::FAMILY_FLAGS> m_queueFlags = IQueue::FAMILY_FLAGS::NONE;
```
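The reworked `SConvReqAccelerationStructure` packs `scratchSize` (47 bits), `buildFlags` (16 bits) and `compact` (1 bit) into exactly one 64-bit word, replacing the old scheme where a 48-bit `WontCompact` sentinel offset doubled as the compaction flag. A standalone sketch of the packing (stand-in types; bitfield layout is formally implementation-defined, but mainstream compilers pack these into a single `uint64_t`):

```cpp
#include <cstdint>
#include <memory>

// stand-in for the conversion request; 47+16+1 bits fill one allocation unit
struct ConvReqSketch
{
	std::shared_ptr<const void> canonical; // in place of core::smart_refctd_ptr
	uint64_t scratchSize : 47 = 0; // up to 128 TiB of scratch, ample headroom
	uint64_t buildFlags : 16 = 0;  // BUILD_FLAGS truncated to 16 bits
	uint64_t compact : 1 = 0;      // explicit flag instead of a sentinel offset
	// scratch + input size also accounting for worst-case alignment padding
	uint64_t buildSize = 0;
};

// verify the three bitfields share a single 64-bit word
struct PackedBitsOnly { uint64_t a : 47; uint64_t b : 16; uint64_t c : 1; };
static_assert(sizeof(PackedBitsOnly)==sizeof(uint64_t),"bitfields straddle a word");

int main() { return 0; }
```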

include/nbl/video/utilities/IGPUObjectFromAssetConverter.h

Lines changed: 0 additions & 163 deletions
```diff
@@ -11,128 +11,6 @@
 #include "nbl/video/ILogicalDevice.h"
 
 #if 0
-auto IGPUObjectFromAssetConverter::create(const asset::ICPUAccelerationStructure** _begin, const asset::ICPUAccelerationStructure** _end, SParams& _params) -> created_gpu_object_array<asset::ICPUAccelerationStructure>
-{
-	const size_t assetCount = std::distance(_begin, _end);
-	auto res = core::make_refctd_dynamic_array<created_gpu_object_array<asset::ICPUAccelerationStructure> >(assetCount);
-	auto toCreateAndBuild = std::vector<const asset::ICPUAccelerationStructure*>();
-	auto buildRangeInfos = std::vector<IGPUAccelerationStructure::BuildRangeInfo*>();
-	toCreateAndBuild.reserve(assetCount);
-	buildRangeInfos.reserve(assetCount);
-	// Lambda function: creates the acceleration structure and its buffer
-	auto allocateBufferAndCreateAccelerationStructure = [&](size_t asSize, const asset::ICPUAccelerationStructure* cpuas)
-	{
-		// Create buffer with cpuas->getAccelerationStructureSize
-		IGPUBuffer::SCreationParams gpuBufParams = {};
-		gpuBufParams.size = asSize;
-		gpuBufParams.usage = core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT;
-		auto gpubuf = _params.device->createBuffer(std::move(gpuBufParams));
-		auto mreqs = gpubuf->getMemoryReqs();
-		mreqs.memoryTypeBits &= _params.device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
-		auto gpubufMem = _params.device->allocate(mreqs, gpubuf.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
-		assert(gpubufMem.isValid());
-
-		// Create GPUAccelerationStructure with that buffer
-		IGPUAccelerationStructure::SCreationParams creatationParams = {};
-		creatationParams.bufferRange.buffer = gpubuf;
-		creatationParams.bufferRange.offset = 0;
-		creatationParams.bufferRange.size = asSize;
-		creatationParams.flags = cpuas->getCreationParameters().flags;
-		creatationParams.type = cpuas->getCreationParameters().type;
-		return _params.device->createAccelerationStructure(std::move(creatationParams));
-	};
-
-	for (ptrdiff_t i = 0u; i < assetCount; ++i)
-	{
-		const asset::ICPUAccelerationStructure* cpuas = _begin[i];
-
-		if(cpuas->hasBuildInfo())
-		{
-			// Add to toBuild vector of ICPUAccelerationStructure
-			toCreateAndBuild.push_back(cpuas);
-			buildRangeInfos.push_back(const_cast<IGPUAccelerationStructure::BuildRangeInfo*>(cpuas->getBuildRanges().begin()));
-		}
-		else if(cpuas->getAccelerationStructureSize() > 0)
-		{
-			res->operator[](i) = allocateBufferAndCreateAccelerationStructure(cpuas->getAccelerationStructureSize(), cpuas);
-		}
-	}
-
-	if(toCreateAndBuild.empty() == false)
-	{
-		bool hostBuildCommands = false; // get from SFeatures
-		if(hostBuildCommands)
-		{
-			_NBL_TODO();
-		}
-		else
-		{
-			core::vector<const asset::ICPUBuffer*> cpuBufferDeps;
-			constexpr uint32_t MaxGeometryPerBuildInfo = 16;
-			constexpr uint32_t MaxBuffersPerGeometry = 3; // TrianglesData -> vertex+index+transformation
-			cpuBufferDeps.reserve(assetCount * MaxGeometryPerBuildInfo * MaxBuffersPerGeometry);
-
-			// Get CPUBuffer Dependencies
-			for (ptrdiff_t i = 0u; i < toCreateAndBuild.size(); ++i)
-			{
-				const asset::ICPUAccelerationStructure* cpuas = toCreateAndBuild[i];
-
-				auto buildInfo = cpuas->getBuildInfo();
-				assert(buildInfo != nullptr);
-
-				auto geoms = buildInfo->getGeometries().begin();
-				auto geomsCount = buildInfo->getGeometries().size();
-				if(geomsCount == 0)
-				{
-					assert(false);
-					continue;
-				}
-
-				for(uint32_t g = 0; g < geomsCount; ++g)
-				{
-					const auto& geom = geoms[g];
-					if(geom.type == asset::IAccelerationStructure::EGT_TRIANGLES)
-					{
-						if(geom.data.triangles.indexData.isValid())
-						{
-							auto cpuBuf = geom.data.triangles.indexData.buffer.get();
-							cpuBuf->addUsageFlags(core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT);
-							cpuBufferDeps.push_back(cpuBuf);
-						}
-						if(geom.data.triangles.vertexData.isValid())
-						{
-							auto cpuBuf = geom.data.triangles.vertexData.buffer.get();
-							cpuBuf->addUsageFlags(core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT);
-							cpuBufferDeps.push_back(cpuBuf);
-						}
-						if(geom.data.triangles.transformData.isValid())
-						{
-							auto cpuBuf = geom.data.triangles.transformData.buffer.get();
-							cpuBuf->addUsageFlags(core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT);
-							cpuBufferDeps.push_back(cpuBuf);
-						}
-					}
-					else if(geom.type == asset::IAccelerationStructure::EGT_AABBS)
-					{
-						if(geom.data.aabbs.data.isValid())
-						{
-							auto cpuBuf = geom.data.aabbs.data.buffer.get();
-							cpuBuf->addUsageFlags(core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT);
-							cpuBufferDeps.push_back(cpuBuf);
-						}
-					}
-					else if(geom.type == asset::IAccelerationStructure::EGT_INSTANCES)
-					{
-						if(geom.data.instances.data.isValid())
-						{
-							auto cpuBuf = geom.data.instances.data.buffer.get();
-							cpuBuf->addUsageFlags(core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT);
-							cpuBufferDeps.push_back(cpuBuf);
-						}
-					}
-				}
-			}
-
 			// Convert CPUBuffer Deps to GPUBuffers
 			core::vector<size_t> redirs = eliminateDuplicatesAndGenRedirs(cpuBufferDeps);
 			auto gpuBufs = getGPUObjectsFromAssets<asset::ICPUBuffer>(cpuBufferDeps.data(), cpuBufferDeps.data()+cpuBufferDeps.size(), _params);
```
```diff
@@ -285,47 +163,6 @@ auto IGPUObjectFromAssetConverter::create(const asset::ICPUAccelerationStructure
 				auto & gpuBuildInfo = buildGeomInfos[i];
 				gpuBuildInfo.scratchAddr.buffer = gpuScratchBuf;
 			}
-
-			// Record CommandBuffer for Building (We have Completed buildInfos + buildRanges for each CPUAS)
-			auto & fence = _params.fences[EQU_COMPUTE];
-			fence = _params.device->createFence(static_cast<IGPUFence::E_CREATE_FLAGS>(0));
-			core::smart_refctd_ptr<IGPUCommandBuffer> cmdbuf = _params.perQueue[EQU_COMPUTE].cmdbuf;
-
-			IQueue::SSubmitInfo submit;
-			{
-				submit.commandBufferCount = 1u;
-				submit.commandBuffers = &cmdbuf.get();
-				submit.waitSemaphoreCount = 0u;
-				submit.pWaitDstStageMask = nullptr;
-				submit.pWaitSemaphores = nullptr;
-				uint32_t waitSemaphoreCount = 0u;
-			}
-
-			assert(cmdbuf->getState() == IGPUCommandBuffer::STATE::RECORDING);
-			cmdbuf->buildAccelerationStructures({buildGeomInfos.data(),buildGeomInfos.data()+buildGeomInfos.size()},buildRangeInfos.data());
-			cmdbuf->end();
-
-			// TODO for future to make this function more sophisticated: Compaction, MemoryLimit for Build
-
-			core::smart_refctd_ptr<IGPUSemaphore> sem;
-
-			if (_params.perQueue[EQU_COMPUTE].semaphore)
-				sem = _params.device->createSemaphore();
-
-			auto* sem_ptr = sem.get();
-			auto* fence_ptr = fence.get();
-
-			submit.signalSemaphoreCount = sem_ptr?1u:0u;
-			submit.pSignalSemaphores = sem_ptr?&sem_ptr:nullptr;
-
-			_params.perQueue[EQU_COMPUTE].queue->submit(1u, &submit, fence_ptr);
-			if (_params.perQueue[EQU_COMPUTE].semaphore)
-				_params.perQueue[EQU_COMPUTE].semaphore[0] = std::move(sem);
-		}
-	}
-
-	return res;
-}
 #endif
 
 #endif
```
