Write the scratch allocation and build the submits

devsh · devsh · commit d409894c973b · 2025-04-20T01:13:54.000+02:00
diff --git a/include/nbl/video/alloc/CAsyncSingleBufferSubAllocator.h b/include/nbl/video/alloc/CAsyncSingleBufferSubAllocator.h
@@ -157,7 +157,7 @@ class CAsyncSingleBufferSubAllocator
         template<typename... Args>
         inline size_type multi_allocate(uint32_t count, Args&&... args) noexcept
         {
-            return multi_alloc(decltype(deferredFrees)::default_wait(),count,std::forward<Args>(args)...);
+            return multi_allocate(TimelineEventHandlerBase::default_wait(),count,std::forward<Args>(args)...);
         }
         //! attempt to allocate, if fail (presumably because of fragmentation), then keep trying till timeout is reached
         template<class Clock=typename std::chrono::steady_clock, typename... Args>
diff --git a/include/nbl/video/asset_traits.h b/include/nbl/video/asset_traits.h
@@ -196,7 +196,7 @@ struct asset_traits<asset::ICPUBottomLevelAccelerationStructure>
 	// we don't need to descend during DFS into other assets
 	constexpr static inline bool HasChildren = true;
 	// the video type
-	using video_t = IGPUImageView;
+	using video_t = IGPUBottomLevelAccelerationStructure;
 	// lookup type
 	using lookup_t = const video_t*;
 };
diff --git a/include/nbl/video/utilities/CAssetConverter.h b/include/nbl/video/utilities/CAssetConverter.h
@@ -934,7 +934,7 @@ class CAssetConverter : public core::IReferenceCounted
 			uint32_t sampledImageBindingCount = 1<<10;
 			uint32_t storageImageBindingCount = 11<<10;
 			// specific to Acceleration Structure Build, they need to be at least as large as the largest amount of scratch required for an AS build
-			CAsyncSingleBufferSubAllocatorST</*using 32bit cause who uses 4GB of scratch for a build!?*/>* scratchForDeviceASBuild = nullptr;
+			CAsyncSingleBufferSubAllocatorST<core::GeneralpurposeAddressAllocator<uint64_t>>* scratchForDeviceASBuild = nullptr;
 			std::pmr::memory_resource* scratchForHostASBuild = nullptr;
 			// needs to service allocations without limit, unlike the above where failure will just force a flush and performance of already queued up builds
 			IDeviceMemoryAllocator* compactedASAllocator = nullptr;
@@ -1068,11 +1068,12 @@ class CAssetConverter : public core::IReferenceCounted
 					constexpr static inline uint64_t WontCompact = (0x1ull<<48)-1;
 					inline bool compact() const {return compactedASWriteOffset!=WontCompact;}
 
-					using build_f = typename CPUAccelerationStructure::BUILD_FLAGS;
+					using build_f = typename asset_traits<CPUAccelerationStructure>::video_t::BUILD_FLAGS;
 					inline void setBuildFlags(const build_f _flags) {buildFlags = static_cast<uint16_t>(_flags);}
 					inline build_f getBuildFlags() const {return static_cast<build_f>(buildFlags);}
 
 
+					uint64_t scratchSize;
 					uint64_t compactedASWriteOffset : 48 = WontCompact;
 					uint64_t buildFlags : 16 = static_cast<uint16_t>(build_f::NONE);
 				};
diff --git a/include/nbl/video/utilities/IUtilities.h b/include/nbl/video/utilities/IUtilities.h
@@ -577,35 +577,6 @@ class NBL_API2 IUtilities : public core::IReferenceCounted
             return true;
         }
 
-        // --------------
-        // buildAccelerationStructures
-        // --------------
-#if 0 // TODO: port later when we have an example
-        //! WARNING: This function blocks the CPU and stalls the GPU!
-        inline void buildAccelerationStructures(IQueue* queue, const core::SRange<const IGPUAccelerationStructure::DeviceBuildGeometryInfo>& pInfos, IGPUAccelerationStructure::BuildRangeInfo* const* ppBuildRangeInfos)
-        {
-            core::smart_refctd_ptr<IGPUCommandPool> pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
-            auto fence = m_device->createFence(static_cast<IGPUFence::E_CREATE_FLAGS>(0));
-            core::smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
-            m_device->createCommandBuffers(pool.get(), IGPUCommandBuffer::LEVEL::PRIMARY, 1u, &cmdbuf);
-            IQueue::SSubmitInfo submit;
-            {
-                submit.commandBufferCount = 1u;
-                submit.commandBuffers = &cmdbuf.get();
-                submit.waitSemaphoreCount = 0u;
-                submit.pWaitDstStageMask = nullptr;
-                submit.pWaitSemaphores = nullptr;
-            }
-
-            cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-            cmdbuf->buildAccelerationStructures(pInfos,ppBuildRangeInfos);
-            cmdbuf->end();
-
-            queue->submit(1u, &submit, fence.get());
-        
-            m_device->blockForFences(1u,&fence.get());
-        }
-#endif
         // --------------
         // updateImageViaStagingBuffer
         // --------------
diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp
@@ -4245,14 +4245,16 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
 						rangeInfos.reserve(tlasCount);
 						auto recordBuilds = [&]()->void
 						{
+							if (buildInfos.empty())
+								return;
 							// rewrite the trackedBLASes pointers
 							for (auto& info : buildInfos)
 							{
 								const auto offset = info.trackedBLASes.data();
 								info.trackedBLASes = {trackedBLASes.data()+reinterpret_cast<const size_t&>(offset),info.trackedBLASes.size()};
 							}
 							//
-							if (!buildInfos.empty() && !computeCmdBuf->cmdbuf->buildAccelerationStructures({buildInfos},rangeInfos.data()))
+							if (!computeCmdBuf->cmdbuf->buildAccelerationStructures({buildInfos},rangeInfos.data()))
 							for (const auto& info : buildInfos)
 							{
 								const auto pFoundHash = findInStaging.operator()<ICPUTopLevelAccelerationStructure>(info.dstAS);
@@ -4263,27 +4265,56 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
 							trackedBLASes.clear();
 						};
 						//
+						using scratch_allocator_t = std::remove_reference_t<decltype(*params.scratchForDeviceASBuild)>;
+						using addr_t = typename scratch_allocator_t::size_type;
+						const auto& limits = device->getPhysicalDevice()->getLimits();
 						for (const auto& tlasToBuild : tlasesToBuild)
 						{
 							const auto as = tlasToBuild.gpuObj;
 							const auto pFoundHash = findInStaging.operator()<ICPUTopLevelAccelerationStructure>(as);
 							const auto instances = tlasToBuild.canonical->getInstances();
+							const auto instanceCount = static_cast<uint32_t>(instances.size());
+							const auto instanceSize = true ? sizeof(IGPUTopLevelAccelerationStructure::DevicePolymorphicInstance):sizeof(IGPUTopLevelAccelerationStructure::DevicePolymorphicInstance);
 							// allocate scratch and build inputs
-							// if fail then flush
-							// stream the info in && check dependents
+							addr_t offsets[2] = {scratch_allocator_t::invalid_value,scratch_allocator_t::invalid_value};
+							{
+								const addr_t sizes[2] = {tlasToBuild.scratchSize,instanceSize*instanceCount};
+								const addr_t alignments[2] = {limits.minAccelerationStructureScratchOffsetAlignment,8}; // TODO: check address allocator can service these alignments
+								const size_t worstSize = core::alignUp(sizes[0],alignments[1])+sizes[1];
+								// it will never fit (prevent CPU hangs)
+								if (const auto& addrAlloc=params.scratchForDeviceASBuild->getAddressAllocator(); addrAlloc.get_free_size()+addrAlloc.get_allocated_size()<worstSize)
+								{
+									markFailureInStaging(as,pFoundHash);
+									continue;
+								}
+								// if the allocation fails, flush and keep retrying until space is freed
+								for (uint32_t t=0; params.scratchForDeviceASBuild->multi_allocate(2u,&offsets[0],&sizes[0],&alignments[0])!=0u; t++)
+								if (t==1) // don't flush on the first failure, because the allocator has not been defragmented yet
+								{
+									recordBuilds();
+									drainCompute();
+								}
+								params.scratchForDeviceASBuild->multi_deallocate(2,&offsets[0],&sizes[0],params.compute->getFutureScratchSemaphore());
+							}
+							// stream the instance/geometry input in && check dependents
+							// unfortunately can't count on large ReBAR heaps so we can't force the `scratchBuffer` to be mapped and writable 
+							for (const auto& instance : instances)
+							{
+								instance.instance;
+							}
 							// prepare build infos
 							auto& buildInfo = buildInfos.emplace_back();
-							buildInfo.scratch = {};
-//							buildInfo.buildFlags = tlasToBuild.getBuildFlags();
+							buildInfo.scratch = {.offset=offsets[0],.buffer=smart_refctd_ptr<IGPUBuffer>(params.scratchForDeviceASBuild->getBuffer())};
+							buildInfo.buildFlags = tlasToBuild.getBuildFlags();
 							buildInfo.dstAS = as;
-							buildInfo.instanceData = {};
+							buildInfo.instanceData = {.offset=offsets[1],.buffer=smart_refctd_ptr<IGPUBuffer>(params.scratchForDeviceASBuild->getBuffer())};
 							// be based cause vectors can grow
 							{
 								const auto offset = trackedBLASes.size();
 								using p_p_BLAS_t = const IGPUBottomLevelAccelerationStructure**;
-								buildInfo.trackedBLASes = {reinterpret_cast<const p_p_BLAS_t&>(offset),instances.size()};
+								buildInfo.trackedBLASes = {reinterpret_cast<const p_p_BLAS_t&>(offset),instanceCount};
 							}
-							rangeInfos.emplace_back(instances.size(),0u);
+							rangeInfos.emplace_back(instanceCount,0u);
 						}
 						recordBuilds();
 						computeCmdBuf->cmdbuf->endDebugMarker();
@@ -4298,7 +4329,8 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
 						computeCmdBuf->cmdbuf->writeAccelerationStructureProperties(compactions,IQueryPool::TYPE::ACCELERATION_STRUCTURE_COMPACTED_SIZE,queryPool.get(),0)
 						)
 					{
-// drain compute
+						// submit because the host needs to read back the query results
+						drainCompute();
 						// get queries
 						core::vector<size_t> sizes(compactions.size());
 						if (device->getQueryPoolResults(

Original file line number	Diff line number	Diff line change
`@@ -157,7 +157,7 @@ class CAsyncSingleBufferSubAllocator`
`157`	`157`	`template<typename... Args>`
`158`	`158`	`inline size_type multi_allocate(uint32_t count, Args&&... args) noexcept`
`159`	`159`	`{`
`160`		`- return multi_alloc(decltype(deferredFrees)::default_wait(),count,std::forward<Args>(args)...);`
	`160`	`+ return multi_allocate(TimelineEventHandlerBase::default_wait(),count,std::forward<Args>(args)...);`
`161`	`161`	`}`
`162`	`162`	`//! attempt to allocate, if fail (presumably because of fragmentation), then keep trying till timeout is reached`
`163`	`163`	`template<class Clock=typename std::chrono::steady_clock, typename... Args>`