Skip to content

Commit d409894

Browse files
author
devsh
committed
write the scratch allocation, build submits
1 parent 31163b3 commit d409894

File tree

5 files changed

+46
-42
lines changed

5 files changed

+46
-42
lines changed

include/nbl/video/alloc/CAsyncSingleBufferSubAllocator.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ class CAsyncSingleBufferSubAllocator
157157
template<typename... Args>
158158
inline size_type multi_allocate(uint32_t count, Args&&... args) noexcept
159159
{
160-
return multi_alloc(decltype(deferredFrees)::default_wait(),count,std::forward<Args>(args)...);
160+
return multi_allocate(TimelineEventHandlerBase::default_wait(),count,std::forward<Args>(args)...);
161161
}
162162
//! attempt to allocate, if fail (presumably because of fragmentation), then keep trying till timeout is reached
163163
template<class Clock=typename std::chrono::steady_clock, typename... Args>

include/nbl/video/asset_traits.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,7 @@ struct asset_traits<asset::ICPUBottomLevelAccelerationStructure>
196196
// we don't need to descend during DFS into other assets
197197
constexpr static inline bool HasChildren = true;
198198
// the video type
199-
using video_t = IGPUImageView;
199+
using video_t = IGPUBottomLevelAccelerationStructure;
200200
// lookup type
201201
using lookup_t = const video_t*;
202202
};

include/nbl/video/utilities/CAssetConverter.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -934,7 +934,7 @@ class CAssetConverter : public core::IReferenceCounted
934934
uint32_t sampledImageBindingCount = 1<<10;
935935
uint32_t storageImageBindingCount = 11<<10;
936936
// specific to Acceleration Structure Build, they need to be at least as large as the largest amount of scratch required for an AS build
937-
CAsyncSingleBufferSubAllocatorST</*using 32bit cause who uses 4GB of scratch for a build!?*/>* scratchForDeviceASBuild = nullptr;
937+
CAsyncSingleBufferSubAllocatorST<core::GeneralpurposeAddressAllocator<uint64_t>>* scratchForDeviceASBuild = nullptr;
938938
std::pmr::memory_resource* scratchForHostASBuild = nullptr;
939939
// needs to service allocations without limit, unlike the above, where a failure just forces a flush that performs the already queued-up builds
940940
IDeviceMemoryAllocator* compactedASAllocator = nullptr;
@@ -1068,11 +1068,12 @@ class CAssetConverter : public core::IReferenceCounted
10681068
constexpr static inline uint64_t WontCompact = (0x1ull<<48)-1;
10691069
inline bool compact() const {return compactedASWriteOffset!=WontCompact;}
10701070

1071-
using build_f = typename CPUAccelerationStructure::BUILD_FLAGS;
1071+
using build_f = typename asset_traits<CPUAccelerationStructure>::video_t::BUILD_FLAGS;
10721072
inline void setBuildFlags(const build_f _flags) {buildFlags = static_cast<uint16_t>(_flags);}
10731073
inline build_f getBuildFlags() const {return static_cast<build_f>(buildFlags);}
10741074

10751075

1076+
uint64_t scratchSize;
10761077
uint64_t compactedASWriteOffset : 48 = WontCompact;
10771078
uint64_t buildFlags : 16 = static_cast<uint16_t>(build_f::NONE);
10781079
};

include/nbl/video/utilities/IUtilities.h

Lines changed: 0 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -577,35 +577,6 @@ class NBL_API2 IUtilities : public core::IReferenceCounted
577577
return true;
578578
}
579579

580-
// --------------
581-
// buildAccelerationStructures
582-
// --------------
583-
#if 0 // TODO: port later when we have an example
584-
//! WARNING: This function blocks the CPU and stalls the GPU!
585-
inline void buildAccelerationStructures(IQueue* queue, const core::SRange<const IGPUAccelerationStructure::DeviceBuildGeometryInfo>& pInfos, IGPUAccelerationStructure::BuildRangeInfo* const* ppBuildRangeInfos)
586-
{
587-
core::smart_refctd_ptr<IGPUCommandPool> pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
588-
auto fence = m_device->createFence(static_cast<IGPUFence::E_CREATE_FLAGS>(0));
589-
core::smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
590-
m_device->createCommandBuffers(pool.get(), IGPUCommandBuffer::LEVEL::PRIMARY, 1u, &cmdbuf);
591-
IQueue::SSubmitInfo submit;
592-
{
593-
submit.commandBufferCount = 1u;
594-
submit.commandBuffers = &cmdbuf.get();
595-
submit.waitSemaphoreCount = 0u;
596-
submit.pWaitDstStageMask = nullptr;
597-
submit.pWaitSemaphores = nullptr;
598-
}
599-
600-
cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
601-
cmdbuf->buildAccelerationStructures(pInfos,ppBuildRangeInfos);
602-
cmdbuf->end();
603-
604-
queue->submit(1u, &submit, fence.get());
605-
606-
m_device->blockForFences(1u,&fence.get());
607-
}
608-
#endif
609580
// --------------
610581
// updateImageViaStagingBuffer
611582
// --------------

src/nbl/video/utilities/CAssetConverter.cpp

Lines changed: 41 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4245,14 +4245,16 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
42454245
rangeInfos.reserve(tlasCount);
42464246
auto recordBuilds = [&]()->void
42474247
{
4248+
if (buildInfos.empty())
4249+
return;
42484250
// rewrite the trackedBLASes pointers
42494251
for (auto& info : buildInfos)
42504252
{
42514253
const auto offset = info.trackedBLASes.data();
42524254
info.trackedBLASes = {trackedBLASes.data()+reinterpret_cast<const size_t&>(offset),info.trackedBLASes.size()};
42534255
}
42544256
//
4255-
if (!buildInfos.empty() && !computeCmdBuf->cmdbuf->buildAccelerationStructures({buildInfos},rangeInfos.data()))
4257+
if (!computeCmdBuf->cmdbuf->buildAccelerationStructures({buildInfos},rangeInfos.data()))
42564258
for (const auto& info : buildInfos)
42574259
{
42584260
const auto pFoundHash = findInStaging.operator()<ICPUTopLevelAccelerationStructure>(info.dstAS);
@@ -4263,27 +4265,56 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
42634265
trackedBLASes.clear();
42644266
};
42654267
//
4268+
using scratch_allocator_t = std::remove_reference_t<decltype(*params.scratchForDeviceASBuild)>;
4269+
using addr_t = typename scratch_allocator_t::size_type;
4270+
const auto& limits = device->getPhysicalDevice()->getLimits();
42664271
for (const auto& tlasToBuild : tlasesToBuild)
42674272
{
42684273
const auto as = tlasToBuild.gpuObj;
42694274
const auto pFoundHash = findInStaging.operator()<ICPUTopLevelAccelerationStructure>(as);
42704275
const auto instances = tlasToBuild.canonical->getInstances();
4276+
const auto instanceCount = static_cast<uint32_t>(instances.size());
4277+
const auto instanceSize = true ? sizeof(IGPUTopLevelAccelerationStructure::DevicePolymorphicInstance):sizeof(IGPUTopLevelAccelerationStructure::DevicePolymorphicInstance);
42714278
// allocate scratch and build inputs
4272-
// if fail then flush
4273-
// stream the info in && check dependents
4279+
addr_t offsets[2] = {scratch_allocator_t::invalid_value,scratch_allocator_t::invalid_value};
4280+
{
4281+
const addr_t sizes[2] = {tlasToBuild.scratchSize,instanceSize*instanceCount};
4282+
const addr_t alignments[2] = {limits.minAccelerationStructureScratchOffsetAlignment,8}; // TODO: check address allocator can service these alignments
4283+
const size_t worstSize = core::alignUp(sizes[0],alignments[1])+sizes[1];
4284+
// it will never fit (prevent CPU hangs)
4285+
if (const auto& addrAlloc=params.scratchForDeviceASBuild->getAddressAllocator(); addrAlloc.get_free_size()+addrAlloc.get_allocated_size()<worstSize)
4286+
{
4287+
markFailureInStaging(as,pFoundHash);
4288+
continue;
4289+
}
4290+
// if fail then flush and keep trying till space is made
4291+
for (uint32_t t=0; params.scratchForDeviceASBuild->multi_allocate(2u,&offsets[0],&sizes[0],&alignments[0])!=0u; t++)
4292+
if (t==1) // don't flush right away cause allocator not defragmented yet
4293+
{
4294+
recordBuilds();
4295+
drainCompute();
4296+
}
4297+
params.scratchForDeviceASBuild->multi_deallocate(2,&offsets[0],&sizes[0],params.compute->getFutureScratchSemaphore());
4298+
}
4299+
// stream the instance/geometry input in && check dependents
4300+
// unfortunately can't count on large ReBAR heaps so we can't force the `scratchBuffer` to be mapped and writable
4301+
for (const auto& instance : instances)
4302+
{
4303+
instance.instance;
4304+
}
42744305
// prepare build infos
42754306
auto& buildInfo = buildInfos.emplace_back();
4276-
buildInfo.scratch = {};
4277-
// buildInfo.buildFlags = tlasToBuild.getBuildFlags();
4307+
buildInfo.scratch = {.offset=offsets[0],.buffer=smart_refctd_ptr<IGPUBuffer>(params.scratchForDeviceASBuild->getBuffer())};
4308+
buildInfo.buildFlags = tlasToBuild.getBuildFlags();
42784309
buildInfo.dstAS = as;
4279-
buildInfo.instanceData = {};
4310+
buildInfo.instanceData = {.offset=offsets[1],.buffer=smart_refctd_ptr<IGPUBuffer>(params.scratchForDeviceASBuild->getBuffer())};
42804311
// store an offset from the base instead of a pointer, because the vector can grow (and reallocate)
42814312
{
42824313
const auto offset = trackedBLASes.size();
42834314
using p_p_BLAS_t = const IGPUBottomLevelAccelerationStructure**;
4284-
buildInfo.trackedBLASes = {reinterpret_cast<const p_p_BLAS_t&>(offset),instances.size()};
4315+
buildInfo.trackedBLASes = {reinterpret_cast<const p_p_BLAS_t&>(offset),instanceCount};
42854316
}
4286-
rangeInfos.emplace_back(instances.size(),0u);
4317+
rangeInfos.emplace_back(instanceCount,0u);
42874318
}
42884319
recordBuilds();
42894320
computeCmdBuf->cmdbuf->endDebugMarker();
@@ -4298,7 +4329,8 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
42984329
computeCmdBuf->cmdbuf->writeAccelerationStructureProperties(compactions,IQueryPool::TYPE::ACCELERATION_STRUCTURE_COMPACTED_SIZE,queryPool.get(),0)
42994330
)
43004331
{
4301-
// drain compute
4332+
// submit cause host needs to read the queries
4333+
drainCompute();
43024334
// get queries
43034335
core::vector<size_t> sizes(compactions.size());
43044336
if (device->getQueryPoolResults(

0 commit comments

Comments
 (0)