
Commit 9a998cb

Author: devsh
cauterize the Host Build API
1 parent 96546e0 commit 9a998cb

2 files changed: +226 -16 lines changed


include/nbl/video/utilities/CAssetConverter.h

Lines changed: 40 additions & 15 deletions
@@ -180,7 +180,7 @@ class CAssetConverter : public core::IReferenceCounted
 			BuildPreference preference : 2 = BuildPreference::Invalid;
 			uint8_t lowMemory : 1 = false;
 			//! things that control the build
-			uint8_t hostBuild : 1 = false;
+			uint8_t hostBuild : 1 = false; // DO NOT USE, will get overridden to false anyway
 			uint8_t compactAfterBuild : 1 = false;

 		protected:
@@ -930,16 +930,16 @@ class CAssetConverter : public core::IReferenceCounted
 			IUtilities* utilities = nullptr;
 			// optional, last submit (compute, transfer if no compute needed) signals these in addition to the scratch semaphore
 			std::span<const IQueue::SSubmitInfo::SSemaphoreInfo> extraSignalSemaphores = {};
-			#ifdef NBL_ACCELERATION_STRUCTURE_CONVERSION
+			// specific to mip-map recomputation, these are okay defaults for the size of our Descriptor Indexed temporary descriptor set
+			uint32_t sampledImageBindingCount = 1<<10;
+			uint32_t storageImageBindingCount = 11<<10;
 			// specific to Acceleration Structure Build, they need to be at least as large as the largest amount of scratch required for an AS build
-			CAsyncSingleBufferSubAllocatorST</*TODO: try uint64_t GP Address Allocator*/>* scratchForDeviceASBuild = nullptr;
+			CAsyncSingleBufferSubAllocatorST</*using 32bit cause who uses 4GB of scratch for a build!?*/>* scratchForDeviceASBuild = nullptr;
 			std::pmr::memory_resource* scratchForHostASBuild = nullptr;
 			// needs to service allocations without limit, unlike the above where failure will just force a flush and performance of already queued up builds
 			IDeviceMemoryAllocator* compactedASAllocator = nullptr;
-			#endif
-			// specific to mip-map recomputation, these are okay defaults for the size of our Descriptor Indexed temporary descriptor set
-			uint32_t sampledImageBindingCount = 1<<10;
-			uint32_t storageImageBindingCount = 11<<10;
+			// How many extra threads you want to use for AS Builds
+			uint16_t extraHostASBuildThreads = 0;
 		};
 		struct SReserveResult final
 		{
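
With the `#ifdef` guard gone, the AS-build knobs are always visible on `SConvertParams`. A minimal sketch of filling them, not part of the commit; `utilities` and `deviceASScratchAllocator` are assumed to exist already, member names are taken from the hunk above:

	CAssetConverter::SConvertParams params = {};
	params.utilities = utilities;
	// descriptor budget for mip-map recomputation, the defaults above are usually fine
	params.sampledImageBindingCount = 1<<10;
	params.storageImageBindingCount = 11<<10;
	// only needed when the reservation reports willDeviceASBuild()
	params.scratchForDeviceASBuild = &deviceASScratchAllocator;
	// host builds are cauterized for now, so leave these at their defaults
	params.scratchForHostASBuild = nullptr;
	params.extraHostASBuildThreads = 0;
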
@@ -961,18 +961,40 @@ class CAssetConverter : public core::IReferenceCounted
 			// https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/vkCmdCopyBufferToImage.html#VUID-vkCmdCopyBufferToImage-commandBuffer-07739
 			inline core::bitflag<IQueue::FAMILY_FLAGS> getRequiredQueueFlags() const {return m_queueFlags;}

-			#ifdef NBL_ACCELERATION_STRUCTURE_CONVERSION
 			// This is just enough memory to build the Acceleration Structures one by one waiting for each Device Build to complete inbetween. If 0 there are no Device AS Builds or Compactions to perform.
-			inline uint64_t getMinASBuildScratchSize(const bool forHostOps) const {return m_minASBuildScratchSize[forHostOps];}
+			inline uint64_t getMinASBuildScratchSize(const bool forHostOps) const
+			{
+				assert(m_minASBuildScratchSize[forHostOps]<=m_maxASBuildScratchSize[forHostOps]);
+				assert((forHostOps ? willHostASBuild():willDeviceASBuild()) == (m_maxASBuildScratchSize[forHostOps]>0));
+				return m_minASBuildScratchSize[forHostOps];
+			}
 			// Enough memory to build and compact all the Acceleration Structures at once, obviously respecting order of BLAS (build->compact) -> TLAS (build->compact)
-			inline uint64_t getMaxASBuildScratchSize(const bool forHostOps) const {return m_maxASBuildScratchSize[forHostOps];}
+			inline uint64_t getMaxASBuildScratchSize(const bool forHostOps) const
+			{
+				assert(m_minASBuildScratchSize[forHostOps]<=m_maxASBuildScratchSize[forHostOps]);
+				assert((forHostOps ? willHostASBuild():willDeviceASBuild()) == (m_maxASBuildScratchSize[forHostOps]>0));
+				return m_maxASBuildScratchSize[forHostOps];
+			}
 			// What usage flags your scratch buffer must have, if it returns NONE there are no Device AS Builds to perform.
-			inline auto getASBuildScratchUsages() const {return m_ASBuildScratchUsages;}
+			inline auto getASBuildScratchUsages() const
+			{
+				assert((m_ASBuildScratchUsages!=IGPUBuffer::E_USAGE_FLAGS::EUF_NONE)==willDeviceASBuild());
+				return m_ASBuildScratchUsages;
+			}
+			// tells you if you need to provide a valid `SConvertParams::scratchForDeviceASBuild`
+			inline bool willDeviceASBuild() const {return m_willDeviceBuildSomeAS;}
 			// tells you if you need to provide a valid `SConvertParams::scratchForHostASBuild`
-			inline bool willHostASBuild() const {return m_willHostBuildSomeAS;}
+			inline bool willHostASBuild() const
+			{
+				assert(m_willHostBuildSomeAS==false); // host builds not supported yet
+				return m_willHostBuildSomeAS;
+			}
 			// tells you if you need to provide a valid `SConvertParams::compactedASAllocator`
-			inline bool willCompactAS() const {return m_willHostBuildSomeAS;}
-			#endif
+			inline bool willCompactAS() const
+			{
+				assert((willDeviceASBuild()||willHostASBuild())==m_willCompactSomeAS);
+				return m_willCompactSomeAS;
+			}

 			//
 			inline operator bool() const {return bool(m_converter);}
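
Taken together, the new queries let you size the device scratch buffer before converting. A hedged sketch, not from the commit; `device` and `reservations` are assumed to exist, the exact `createBuffer` call and the memory binding step are elided assumptions:

	if (reservations.willDeviceASBuild())
	{
		IGPUBuffer::SCreationParams creationParams = {};
		// anything between min (serial builds with waits inbetween) and max (all builds in flight at once) works
		creationParams.size = reservations.getMaxASBuildScratchSize(/*forHostOps=*/false);
		creationParams.usage = reservations.getASBuildScratchUsages();
		auto scratchBuffer = device->createBuffer(std::move(creationParams));
		// ... allocate and bind memory, then wrap the buffer in the suballocator
		// that gets passed as `SConvertParams::scratchForDeviceASBuild`
	}
	// after this commit host builds never happen, so this is always false
	assert(!reservations.willHostASBuild());
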
@@ -1064,10 +1086,13 @@ class CAssetConverter : public core::IReferenceCounted
 			core::vector<SConvReqAccelerationStructure<asset::ICPUBottomLevelAccelerationStructure>> m_blasConversions[2];
 			core::vector<SConvReqAccelerationStructure<asset::ICPUTopLevelAccelerationStructure>> m_tlasConversions[2];

-			//
+			// 0 for device builds, 1 for host builds
 			uint64_t m_minASBuildScratchSize[2] = {0,0};
 			uint64_t m_maxASBuildScratchSize[2] = {0,0};
+			// is there even more than one usage needed?
 			core::bitflag<IGPUBuffer::E_USAGE_FLAGS> m_ASBuildScratchUsages = IGPUBuffer::E_USAGE_FLAGS::EUF_NONE;
+			// TODO: do we need those bools?
+			uint8_t m_willDeviceBuildSomeAS : 1 = false;
 			uint8_t m_willHostBuildSomeAS : 1 = false;
 			uint8_t m_willCompactSomeAS : 1 = false;

src/nbl/video/utilities/CAssetConverter.cpp

Lines changed: 186 additions & 1 deletion
@@ -116,10 +116,15 @@ bool CAssetConverter::acceleration_structure_patch_base::valid(const ILogicalDev
 	if (allowDataAccess && !limits.rayTracingPositionFetch)
 		return false;
 	// can always build with the device
+	if (hostBuild)
 #ifdef NBL_ACCELERATION_STRUCTURE_CONVERSION_HOST_READY
-	if (hostBuild && !features.accelerationStructureHostCommands)
+	if (!features.accelerationStructureHostCommands)
 #endif
+	{
+		if (auto logger=device->getLogger();logger)
+			logger->log("Host Acceleration Structure Builds are not yet supported!",system::ILogger::ELL_ERROR);
 		hostBuild = false;
+	}
 	return true;
 }
 CAssetConverter::patch_impl_t<ICPUBottomLevelAccelerationStructure>::patch_impl_t(const ICPUBottomLevelAccelerationStructure* blas)
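
In effect, requesting a host build is now a no-op: `valid()` logs the error and clears the bit whether or not the host-ready path is compiled in. A small sketch of the observable behavior, hypothetical usage assuming the usual `patch_t` interface, a `cpuBlas` asset and a valid `device`:

	CAssetConverter::patch_t<ICPUBottomLevelAccelerationStructure> patch(cpuBlas);
	patch.hostBuild = true; // "DO NOT USE", per the header change above
	const bool stillValid = patch.valid(device); // logs ELL_ERROR, forces hostBuild back to false
	assert(stillValid && !patch.hostBuild); // the patch stays valid, just on the device build path
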
@@ -4404,5 +4409,185 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
 	return retval;
 }

+#if 0
+// Lots of extra work is why we didn't want to pursue it:
+// - TLAS builds should happen semi-concurrently to BLAS, but need to know what TLAS needs what BLAS to finish (scheduling)
+//   + also device TLAS builds should know what Host Built BLAS they depend on, so that `pool.work()` is called until the BLAS's associated deferred op signals COMPLETE
+// - any AS should enqueue in a weird way with a sort of RLE, we allocate scratch until we can't, then build whatever we can
+// - the list of outstanding BLAS and TLAS to build should get updated periodically
+// - overflow callbacks should call back into the BLAS and TLAS enqueuers and `pool.work()`
+struct ASBuilderPool
+{
+	public:
+		struct Worker
+		{
+			public:
+				inline Worker(const ASBuilderPool* _pool) : pool(_pool), pushCount(0), executor(&Worker::execute,this) {}
+				inline ~Worker() {executor.join();}
+
+				inline void push(smart_refctd_ptr<IDeferredOperation>&& task)
+				{
+					std::lock_guard<std::mutex> lock(queueLock);
+					tasks.push_back(std::move(task));
+					pushCount.fetch_add(1);
+					pushCount.notify_one();
+				}
+
+			private:
+				inline void execute()
+				{
+					uint64_t oldTaskCount = 0;
+					uint32_t taskIx = 0;
+					while (!pool->stop.test())
+					{
+						IDeferredOperation* task;
+						// grab the task under a lock so we're not in danger of the vector reallocating
+						{
+							std::lock_guard<std::mutex> lock(queueLock);
+							if (taskIx>=tasks.size())
+								taskIx = 0;
+							task = tasks.empty() ? nullptr:tasks[taskIx].get();
+						}
+						if (!task)
+						{
+							// sleep until a new push bumps the counter past what we've already seen
+							pushCount.wait(oldTaskCount);
+							oldTaskCount = pushCount.load();
+							continue;
+						}
+						switch (task->execute())
+						{
+							case IDeferredOperation::STATUS::THREAD_IDLE:
+								taskIx++; // next task
+								break;
+							default:
+							{
+								std::lock_guard<std::mutex> lock(queueLock);
+								tasks.erase(tasks.begin()+taskIx);
+								break;
+							}
+						}
+					}
+				}
+
+				std::mutex queueLock;
+				const ASBuilderPool* pool;
+				std::atomic_uint64_t pushCount;
+				core::vector<smart_refctd_ptr<IDeferredOperation>> tasks;
+				// declared last so the thread can't start before the members it touches are constructed
+				std::thread executor;
+		};
+
+		inline ASBuilderPool(const uint16_t _workerCount, system::logger_opt_ptr _logger) : stop(), workerCount(_workerCount), nextWorkerPush(0), logger(_logger)
+		{
+			// pseudocode: `Worker` has no default constructor, each one really needs constructing with `this`
+			workers = std::make_unique<Worker[]>(workerCount);
+		}
+		inline ~ASBuilderPool()
+		{
+			finish();
+		}
+
+		inline void finish()
+		{
+			while (work()) {}
+			stop.test_and_set();
+			// note: workers actually block on their own `pushCount`, so they'd need a wakeup here too
+			stop.notify_one();
+			workers = nullptr;
+		}
+
+		struct Build
+		{
+			smart_refctd_ptr<IDeferredOperation> op;
+			// WRONG: for every deferred op, there are multiple `gpuObj` and `hash` that get built by it
+			IGPUAccelerationStructure* gpuObj;
+			core::blake3_hash_t* hash;
+		};
+		inline void push(Build&& build)
+		{
+			auto op = build.op.get();
+			if (!op->isPending())
+			{
+				logger.log("Host Acceleration Structure failed for \"%s\"",system::ILogger::ELL_ERROR,build.gpuObj->getObjectDebugName());
+				// change the content hash on the reverse map to a NoContentHash
+				*build.hash = CHashCache::NoContentHash;
+				return;
+			}
+			// there's no true best way to pick the worker with the least work
+			for (uint16_t i=0; i<min<uint16_t>(op->getMaxConcurrency()-1,workerCount); i++)
+				workers[(nextWorkerPush++)%workerCount].push(smart_refctd_ptr<IDeferredOperation>(op));
+			buildsInProgress.push_back(std::move(build));
+		}
+
+		inline bool empty() const {return buildsInProgress.empty();}
+
+		// The idea is to somehow get the overflow callbacks to call this
+		inline bool work()
+		{
+			if (empty())
+				return false;
+			auto build = buildsInProgress.begin()+buildIx;
+			switch (build->op->execute())
+			{
+				case IDeferredOperation::STATUS::THREAD_IDLE:
+					buildIx++; // next task
+					break;
+				case IDeferredOperation::STATUS::_ERROR:
+					logger.log("Host Acceleration Structure failed for \"%s\"",system::ILogger::ELL_ERROR,build->gpuObj->getObjectDebugName());
+					// change the content hash on the reverse map to a NoContentHash
+					*build->hash = CHashCache::NoContentHash;
+					[[fallthrough]];
+				default:
+				{
+					buildsInProgress.erase(build);
+					break;
+				}
+			}
+			if (buildIx>=buildsInProgress.size())
+				buildIx = 0;
+			// true while there are still builds to poll
+			return !buildsInProgress.empty();
+		}
+
+		std::atomic_flag stop;
+
+	private:
+		uint16_t workerCount;
+		uint16_t nextWorkerPush = 0;
+		system::logger_opt_ptr logger;
+		std::unique_ptr<Worker[]> workers;
+		core::vector<Build> buildsInProgress;
+		uint32_t buildIx = 0;
+};
+ASBuilderPool hostBuilders(params.extraHostASBuildThreads,logger);
+
+// crappy pseudocode
+auto hostBLASConvIt = reservations.m_blasConversions[1].begin();
+auto hostBLASConvEnd = reservations.m_blasConversions[1].end();
+while (hostBLASConvIt!=hostBLASConvEnd)
+{
+	auto op = device->createDeferredOperation();
+	if (!op)
+		; // error, mark failure in staging
+	core::vector<IGPUBottomLevelAccelerationStructure::HostBuildInfo> infos;
+	core::vector<IGPUBottomLevelAccelerationStructure::BuildRangeInfo> ranges;
+	for (; hostBLASConvIt!=hostBLASConvEnd; hostBLASConvIt++)
+	{
+		// pseudocode: suballocate `scratchSize` bytes from `params.scratchForHostASBuild`, nullptr on failure
+		void* scratch = params.scratchForHostASBuild->allocate(hostBLASConvIt->scratchSize);
+		if (!scratch)
+		{
+			if (infos.empty() && hostBuilders.empty())
+				; // error, mark failure in staging, can't even enqueue 1 build
+			else
+				break;
+		}
+
+		auto asset = hostBLASConvIt->canonical;
+		// pseudocode: the real per-geometry range contents come from here
+		asset->getGeometryPrimitiveCounts();
+		ranges.push_back({
+			.primitiveCount = 0,
+			.primitiveByteOffset = 0,
+			.firstVertex = 0,
+			.transformByteOffset = 0
+		});
+	}
+	if (!device->buildAccelerationStructures(op.get(),infos,ranges.data()))
+		continue;
+}
+#endif
 }
 }
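
For what it's worth, the intended driving pattern for the shelved pool appears to be: push one `Build` per deferred op, then poll `work()` (ideally from the overflow callbacks) until nothing is in flight. A hypothetical sketch, assuming host builds were ever enabled and that `deferredOp`, `gpuAS` and `reverseMapHash` exist:

	ASBuilderPool pool(params.extraHostASBuildThreads,logger);
	pool.push({.op=std::move(deferredOp),.gpuObj=gpuAS.get(),.hash=&reverseMapHash});
	while (pool.work()) {} // spin the calling thread alongside the worker threads
	pool.finish(); // drains remaining builds and joins the workers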
