
Commit 9a998cb

Author: devsh
cauterize the Host Build API
1 parent 96546e0 commit 9a998cb

2 files changed: +226 -16 lines changed


include/nbl/video/utilities/CAssetConverter.h

Lines changed: 40 additions & 15 deletions
@@ -180,7 +180,7 @@ class CAssetConverter : public core::IReferenceCounted
 			BuildPreference preference : 2 = BuildPreference::Invalid;
 			uint8_t lowMemory : 1 = false;
 			//! things that control the build
-			uint8_t hostBuild : 1 = false;
+			uint8_t hostBuild : 1 = false; // DO NOT USE, will get overridden to false anyway
 			uint8_t compactAfterBuild : 1 = false;

 		protected:
@@ -930,16 +930,16 @@ class CAssetConverter : public core::IReferenceCounted
 			IUtilities* utilities = nullptr;
 			// optional, last submit (compute, transfer if no compute needed) signals these in addition to the scratch semaphore
 			std::span<const IQueue::SSubmitInfo::SSemaphoreInfo> extraSignalSemaphores = {};
-			#ifdef NBL_ACCELERATION_STRUCTURE_CONVERSION
+			// specific to mip-map recomputation, these are okay defaults for the size of our Descriptor Indexed temporary descriptor set
+			uint32_t sampledImageBindingCount = 1<<10;
+			uint32_t storageImageBindingCount = 11<<10;
 			// specific to Acceleration Structure Build, they need to be at least as large as the largest amount of scratch required for an AS build
-			CAsyncSingleBufferSubAllocatorST</*TODO: try uint64_t GP Address Allocator*/>* scratchForDeviceASBuild = nullptr;
+			CAsyncSingleBufferSubAllocatorST</*using 32bit cause who uses 4GB of scratch for a build!?*/>* scratchForDeviceASBuild = nullptr;
 			std::pmr::memory_resource* scratchForHostASBuild = nullptr;
 			// needs to service allocations without limit, unlike the above where failure will just force a flush and performance of already queued up builds
 			IDeviceMemoryAllocator* compactedASAllocator = nullptr;
-			#endif
-			// specific to mip-map recomputation, these are okay defaults for the size of our Descriptor Indexed temporary descriptor set
-			uint32_t sampledImageBindingCount = 1<<10;
-			uint32_t storageImageBindingCount = 11<<10;
+			// How many extra threads you want to use for AS Builds
+			uint16_t extraHostASBuildThreads = 0;
 		};
 		struct SReserveResult final
 		{
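
With the `#ifdef` guard gone, the AS-build knobs are always visible on `SConvertParams`. A minimal sketch of filling them, not part of the commit; `utilities` and `deviceASScratchAllocator` are assumed to exist already, member names are taken from the hunk above:

	CAssetConverter::SConvertParams params = {};
	params.utilities = utilities;
	// descriptor budget for mip-map recomputation, the defaults above are usually fine
	params.sampledImageBindingCount = 1<<10;
	params.storageImageBindingCount = 11<<10;
	// only needed when the reservation reports willDeviceASBuild()
	params.scratchForDeviceASBuild = &deviceASScratchAllocator;
	// host builds are cauterized for now, so leave these at their defaults
	params.scratchForHostASBuild = nullptr;
	params.extraHostASBuildThreads = 0;
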
@@ -961,18 +961,40 @@ class CAssetConverter : public core::IReferenceCounted
 			// https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/vkCmdCopyBufferToImage.html#VUID-vkCmdCopyBufferToImage-commandBuffer-07739
 			inline core::bitflag<IQueue::FAMILY_FLAGS> getRequiredQueueFlags() const {return m_queueFlags;}

-			#ifdef NBL_ACCELERATION_STRUCTURE_CONVERSION
 			// This is just enough memory to build the Acceleration Structures one by one waiting for each Device Build to complete inbetween. If 0 there are no Device AS Builds or Compactions to perform.
-			inline uint64_t getMinASBuildScratchSize(const bool forHostOps) const {return m_minASBuildScratchSize[forHostOps];}
+			inline uint64_t getMinASBuildScratchSize(const bool forHostOps) const
+			{
+				assert(m_minASBuildScratchSize[forHostOps]<=m_maxASBuildScratchSize[forHostOps]);
+				assert((forHostOps ? willHostASBuild():willDeviceASBuild()) == (m_maxASBuildScratchSize[forHostOps]>0));
+				return m_minASBuildScratchSize[forHostOps];
+			}
 			// Enough memory to build and compact all the Acceleration Structures at once, obviously respecting order of BLAS (build->compact) -> TLAS (build->compact)
-			inline uint64_t getMaxASBuildScratchSize(const bool forHostOps) const {return m_maxASBuildScratchSize[forHostOps];}
+			inline uint64_t getMaxASBuildScratchSize(const bool forHostOps) const
+			{
+				assert(m_minASBuildScratchSize[forHostOps]<=m_maxASBuildScratchSize[forHostOps]);
+				assert((forHostOps ? willHostASBuild():willDeviceASBuild()) == (m_maxASBuildScratchSize[forHostOps]>0));
+				return m_maxASBuildScratchSize[forHostOps];
+			}
 			// What usage flags your scratch buffer must have, if it returns NONE there are no Device AS Builds to perform.
-			inline auto getASBuildScratchUsages() const {return m_ASBuildScratchUsages;}
+			inline auto getASBuildScratchUsages() const
+			{
+				assert((m_ASBuildScratchUsages!=IGPUBuffer::E_USAGE_FLAGS::EUF_NONE)==willDeviceASBuild());
+				return m_ASBuildScratchUsages;
+			}
+			// tells you if you need to provide a valid `SConvertParams::scratchForDeviceASBuild`
+			inline bool willDeviceASBuild() const {return m_willDeviceBuildSomeAS;}
 			// tells you if you need to provide a valid `SConvertParams::scratchForHostASBuild`
-			inline bool willHostASBuild() const {return m_willHostBuildSomeAS;}
+			inline bool willHostASBuild() const
+			{
+				assert(m_willHostBuildSomeAS==false); // host builds not supported yet
+				return m_willHostBuildSomeAS;
+			}
 			// tells you if you need to provide a valid `SConvertParams::compactedASAllocator`
-			inline bool willCompactAS() const {return m_willHostBuildSomeAS;}
-			#endif
+			inline bool willCompactAS() const
+			{
+				assert((willDeviceASBuild()||willHostASBuild())==m_willCompactSomeAS);
+				return m_willCompactSomeAS;
+			}

 			//
 			inline operator bool() const {return bool(m_converter);}
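
Taken together, the new queries let you size the device scratch buffer before converting. A hedged sketch, not from the commit; `device` and `reservations` are assumed to exist, the exact `createBuffer` call and the memory binding step are elided assumptions:

	if (reservations.willDeviceASBuild())
	{
		IGPUBuffer::SCreationParams creationParams = {};
		// anything between min (serial builds with waits inbetween) and max (all builds in flight at once) works
		creationParams.size = reservations.getMaxASBuildScratchSize(/*forHostOps=*/false);
		creationParams.usage = reservations.getASBuildScratchUsages();
		auto scratchBuffer = device->createBuffer(std::move(creationParams));
		// ... allocate and bind memory, then wrap the buffer in the suballocator
		// that gets passed as `SConvertParams::scratchForDeviceASBuild`
	}
	// after this commit host builds never happen, so this is always false
	assert(!reservations.willHostASBuild());
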
@@ -1064,10 +1086,13 @@ class CAssetConverter : public core::IReferenceCounted
 			core::vector<SConvReqAccelerationStructure<asset::ICPUBottomLevelAccelerationStructure>> m_blasConversions[2];
 			core::vector<SConvReqAccelerationStructure<asset::ICPUTopLevelAccelerationStructure>> m_tlasConversions[2];

-			//
+			// 0 for device builds, 1 for host builds
 			uint64_t m_minASBuildScratchSize[2] = {0,0};
 			uint64_t m_maxASBuildScratchSize[2] = {0,0};
+			// is there even more than one usage needed?
 			core::bitflag<IGPUBuffer::E_USAGE_FLAGS> m_ASBuildScratchUsages = IGPUBuffer::E_USAGE_FLAGS::EUF_NONE;
+			// TODO: do we need those bools?
+			uint8_t m_willDeviceBuildSomeAS : 1 = false;
 			uint8_t m_willHostBuildSomeAS : 1 = false;
 			uint8_t m_willCompactSomeAS : 1 = false;

src/nbl/video/utilities/CAssetConverter.cpp

Lines changed: 186 additions & 1 deletion
@@ -116,10 +116,15 @@ bool CAssetConverter::acceleration_structure_patch_base::valid(const ILogicalDev
 	if (allowDataAccess && !limits.rayTracingPositionFetch)
 		return false;
 	// can always build with the device
+	if (hostBuild)
 #ifdef NBL_ACCELERATION_STRUCTURE_CONVERSION_HOST_READY
-	if (hostBuild && !features.accelerationStructureHostCommands)
+	if (!features.accelerationStructureHostCommands)
 #endif
+	{
+		if (auto logger=device->getLogger();logger)
+			logger->log("Host Acceleration Structure Builds are not yet supported!",system::ILogger::ELL_ERROR);
 		hostBuild = false;
+	}
 	return true;
 }
 CAssetConverter::patch_impl_t<ICPUBottomLevelAccelerationStructure>::patch_impl_t(const ICPUBottomLevelAccelerationStructure* blas)
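
In effect, requesting a host build is now a no-op: `valid()` logs the error and clears the bit whether or not the host-ready path is compiled in. A small sketch of the observable behavior, hypothetical usage assuming the usual `patch_t` interface, a `cpuBlas` asset and a valid `device`:

	CAssetConverter::patch_t<ICPUBottomLevelAccelerationStructure> patch(cpuBlas);
	patch.hostBuild = true; // "DO NOT USE", per the header change above
	const bool stillValid = patch.valid(device); // logs ELL_ERROR, forces hostBuild back to false
	assert(stillValid && !patch.hostBuild); // the patch stays valid, just on the device build path
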
@@ -4404,5 +4409,185 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
 	return retval;
 }

+#if 0
+// Lots of extra work is why we didn't want to pursue it:
+// - TLAS builds should happen semi-concurrently to BLAS, but need to know what TLAS needs what BLAS to finish (scheduling)
+//   + also device TLAS builds should know what Host Built BLAS they depend on, so that `pool.work()` is called until the BLAS's associated deferred op signals COMPLETE
+// - any AS should enqueue in a weird way with a sort of RLE, we allocate scratch until we can't, then build whatever we can
+// - the list of outstanding BLAS and TLAS to build should get updated periodically
+// - overflow callbacks should call back into the BLAS and TLAS enqueuers and `pool.work()`
+struct ASBuilderPool
+{
+	public:
+		struct Worker
+		{
+			public:
+				inline Worker(const ASBuilderPool* _pool) : pool(_pool), pushCount(0), executor(&Worker::execute,this) {}
+				inline ~Worker() {executor.join();}
+
+				inline void push(smart_refctd_ptr<IDeferredOperation>&& task)
+				{
+					std::lock_guard<std::mutex> lock(queueLock);
+					tasks.push_back(std::move(task));
+					pushCount.fetch_add(1);
+					pushCount.notify_one();
+				}
+
+			private:
+				inline void execute()
+				{
+					uint64_t oldTaskCount = 0;
+					uint32_t taskIx = 0;
+					while (!pool->stop.test())
+					{
+						IDeferredOperation* task;
+						// grab the task under a lock so we're not in danger of the vector reallocating
+						{
+							std::lock_guard<std::mutex> lock(queueLock);
+							if (taskIx>=tasks.size())
+								taskIx = 0;
+							task = tasks.empty() ? nullptr:tasks[taskIx].get();
+						}
+						if (!task)
+						{
+							// sleep until a new push bumps the counter past what we've already seen
+							pushCount.wait(oldTaskCount);
+							oldTaskCount = pushCount.load();
+							continue;
+						}
+						switch (task->execute())
+						{
+							case IDeferredOperation::STATUS::THREAD_IDLE:
+								taskIx++; // next task
+								break;
+							default:
+							{
+								std::lock_guard<std::mutex> lock(queueLock);
+								tasks.erase(tasks.begin()+taskIx);
+								break;
+							}
+						}
+					}
+				}
+
+				std::mutex queueLock;
+				const ASBuilderPool* pool;
+				std::atomic_uint64_t pushCount;
+				core::vector<smart_refctd_ptr<IDeferredOperation>> tasks;
+				// declared last so the thread can't start before the members it touches are constructed
+				std::thread executor;
+		};
+
+		inline ASBuilderPool(const uint16_t _workerCount, system::logger_opt_ptr _logger) : stop(), workerCount(_workerCount), nextWorkerPush(0), logger(_logger)
+		{
+			// pseudocode: `Worker` has no default constructor, each one really needs constructing with `this`
+			workers = std::make_unique<Worker[]>(workerCount);
+		}
+		inline ~ASBuilderPool()
+		{
+			finish();
+		}
+
+		inline void finish()
+		{
+			while (work()) {}
+			stop.test_and_set();
+			// note: workers actually block on their own `pushCount`, so they'd need a wakeup here too
+			stop.notify_one();
+			workers = nullptr;
+		}
+
+		struct Build
+		{
+			smart_refctd_ptr<IDeferredOperation> op;
+			// WRONG: for every deferred op, there are multiple `gpuObj` and `hash` that get built by it
+			IGPUAccelerationStructure* gpuObj;
+			core::blake3_hash_t* hash;
+		};
+		inline void push(Build&& build)
+		{
+			auto op = build.op.get();
+			if (!op->isPending())
+			{
+				logger.log("Host Acceleration Structure failed for \"%s\"",system::ILogger::ELL_ERROR,build.gpuObj->getObjectDebugName());
+				// change the content hash on the reverse map to a NoContentHash
+				*build.hash = CHashCache::NoContentHash;
+				return;
+			}
+			// there's no true best way to pick the worker with the least work
+			for (uint16_t i=0; i<min<uint16_t>(op->getMaxConcurrency()-1,workerCount); i++)
+				workers[(nextWorkerPush++)%workerCount].push(smart_refctd_ptr<IDeferredOperation>(op));
+			buildsInProgress.push_back(std::move(build));
+		}
+
+		inline bool empty() const {return buildsInProgress.empty();}
+
+		// The idea is to somehow get the overflow callbacks to call this
+		inline bool work()
+		{
+			if (empty())
+				return false;
+			auto build = buildsInProgress.begin()+buildIx;
+			switch (build->op->execute())
+			{
+				case IDeferredOperation::STATUS::THREAD_IDLE:
+					buildIx++; // next task
+					break;
+				case IDeferredOperation::STATUS::_ERROR:
+					logger.log("Host Acceleration Structure failed for \"%s\"",system::ILogger::ELL_ERROR,build->gpuObj->getObjectDebugName());
+					// change the content hash on the reverse map to a NoContentHash
+					*build->hash = CHashCache::NoContentHash;
+					[[fallthrough]];
+				default:
+				{
+					buildsInProgress.erase(build);
+					break;
+				}
+			}
+			if (buildIx>=buildsInProgress.size())
+				buildIx = 0;
+			// true while there are still builds to poll
+			return !buildsInProgress.empty();
+		}
+
+		std::atomic_flag stop;
+
+	private:
+		uint16_t workerCount;
+		uint16_t nextWorkerPush = 0;
+		system::logger_opt_ptr logger;
+		std::unique_ptr<Worker[]> workers;
+		core::vector<Build> buildsInProgress;
+		uint32_t buildIx = 0;
+};
+ASBuilderPool hostBuilders(params.extraHostASBuildThreads,logger);
+
+// crappy pseudocode
+auto hostBLASConvIt = reservations.m_blasConversions[1].begin();
+auto hostBLASConvEnd = reservations.m_blasConversions[1].end();
+while (hostBLASConvIt!=hostBLASConvEnd)
+{
+	auto op = device->createDeferredOperation();
+	if (!op)
+		; // error, mark failure in staging
+	core::vector<IGPUBottomLevelAccelerationStructure::HostBuildInfo> infos;
+	core::vector<IGPUBottomLevelAccelerationStructure::BuildRangeInfo> ranges;
+	for (; hostBLASConvIt!=hostBLASConvEnd; hostBLASConvIt++)
+	{
+		// pseudocode: suballocate `scratchSize` bytes from `params.scratchForHostASBuild`, nullptr on failure
+		void* scratch = params.scratchForHostASBuild->allocate(hostBLASConvIt->scratchSize);
+		if (!scratch)
+		{
+			if (infos.empty() && hostBuilders.empty())
+				; // error, mark failure in staging, can't even enqueue 1 build
+			else
+				break;
+		}
+
+		auto asset = hostBLASConvIt->canonical;
+		// pseudocode: the real per-geometry range contents come from here
+		asset->getGeometryPrimitiveCounts();
+		ranges.push_back({
+			.primitiveCount = 0,
+			.primitiveByteOffset = 0,
+			.firstVertex = 0,
+			.transformByteOffset = 0
+		});
+	}
+	if (!device->buildAccelerationStructures(op.get(),infos,ranges.data()))
+		continue;
+}
+#endif
 }
 }
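
For what it's worth, the intended driving pattern for the shelved pool appears to be: push one `Build` per deferred op, then poll `work()` (ideally from the overflow callbacks) until nothing is in flight. A hypothetical sketch, assuming host builds were ever enabled and that `deferredOp`, `gpuAS` and `reverseMapHash` exist:

	ASBuilderPool pool(params.extraHostASBuildThreads,logger);
	pool.push({.op=std::move(deferredOp),.gpuObj=gpuAS.get(),.hash=&reverseMapHash});
	while (pool.work()) {} // spin the calling thread alongside the worker threads
	pool.finish(); // drains remaining builds and joins the workers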
