@@ -4245,14 +4245,16 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
4245
4245
rangeInfos.reserve (tlasCount);
4246
4246
auto recordBuilds = [&]()->void
4247
4247
{
4248
+ if (buildInfos.empty ())
4249
+ return ;
4248
4250
// rewrite the trackedBLASes pointers
4249
4251
for (auto & info : buildInfos)
4250
4252
{
4251
4253
const auto offset = info.trackedBLASes .data ();
4252
4254
info.trackedBLASes = {trackedBLASes.data ()+reinterpret_cast <const size_t &>(offset),info.trackedBLASes .size ()};
4253
4255
}
4254
4256
//
4255
- if (!buildInfos. empty () && ! computeCmdBuf->cmdbuf ->buildAccelerationStructures ({buildInfos},rangeInfos.data ()))
4257
+ if (!computeCmdBuf->cmdbuf ->buildAccelerationStructures ({buildInfos},rangeInfos.data ()))
4256
4258
for (const auto & info : buildInfos)
4257
4259
{
4258
4260
const auto pFoundHash = findInStaging.operator ()<ICPUTopLevelAccelerationStructure>(info.dstAS );
@@ -4263,27 +4265,56 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
4263
4265
trackedBLASes.clear ();
4264
4266
};
4265
4267
//
4268
+ using scratch_allocator_t = std::remove_reference_t <decltype (*params.scratchForDeviceASBuild )>;
4269
+ using addr_t = typename scratch_allocator_t ::size_type;
4270
+ const auto & limits = device->getPhysicalDevice ()->getLimits ();
4266
4271
for (const auto & tlasToBuild : tlasesToBuild)
4267
4272
{
4268
4273
const auto as = tlasToBuild.gpuObj ;
4269
4274
const auto pFoundHash = findInStaging.operator ()<ICPUTopLevelAccelerationStructure>(as);
4270
4275
const auto instances = tlasToBuild.canonical ->getInstances ();
4276
+ const auto instanceCount = static_cast <uint32_t >(instances.size ());
4277
+ const auto instanceSize = true ? sizeof (IGPUTopLevelAccelerationStructure::DevicePolymorphicInstance):sizeof (IGPUTopLevelAccelerationStructure::DevicePolymorphicInstance);
4271
4278
// allocate scratch and build inputs
4272
- // if fail then flush
4273
- // stream the info in && check dependents
4279
+ addr_t offsets[2 ] = {scratch_allocator_t ::invalid_value,scratch_allocator_t ::invalid_value};
4280
+ {
4281
+ const addr_t sizes[2 ] = {tlasToBuild.scratchSize ,instanceSize*instanceCount};
4282
+ const addr_t alignments[2 ] = {limits.minAccelerationStructureScratchOffsetAlignment ,8 }; // TODO: check address allocator can service these alignments
4283
+ const size_t worstSize = core::alignUp (sizes[0 ],alignments[1 ])+sizes[1 ];
4284
+ // it will never fit (prevent CPU hangs)
4285
+ if (const auto & addrAlloc=params.scratchForDeviceASBuild ->getAddressAllocator (); addrAlloc.get_free_size ()+addrAlloc.get_allocated_size ()<worstSize)
4286
+ {
4287
+ markFailureInStaging (as,pFoundHash);
4288
+ continue ;
4289
+ }
4290
+ // if fail then flush and keep trying till space is made
4291
+ for (uint32_t t=0 ; params.scratchForDeviceASBuild ->multi_allocate (2u ,&offsets[0 ],&sizes[0 ],&alignments[0 ])!=0u ; t++)
4292
+ if (t==1 ) // don't flush right away cause allocator not defragmented yet
4293
+ {
4294
+ recordBuilds ();
4295
+ drainCompute ();
4296
+ }
4297
+ params.scratchForDeviceASBuild ->multi_deallocate (2 ,&offsets[0 ],&sizes[0 ],params.compute ->getFutureScratchSemaphore ());
4298
+ }
4299
+ // stream the instance/geometry input in && check dependents
4300
+ // unfortunately can't count on large ReBAR heaps so we can't force the `scratchBuffer` to be mapped and writable
4301
+ for (const auto & instance : instances)
4302
+ {
4303
+ instance.instance ;
4304
+ }
4274
4305
// prepare build infos
4275
4306
auto & buildInfo = buildInfos.emplace_back ();
4276
- buildInfo.scratch = {};
4277
- // buildInfo.buildFlags = tlasToBuild.getBuildFlags();
4307
+ buildInfo.scratch = {. offset =offsets[ 0 ],. buffer =smart_refctd_ptr<IGPUBuffer>(params. scratchForDeviceASBuild -> getBuffer ()) };
4308
+ buildInfo.buildFlags = tlasToBuild.getBuildFlags ();
4278
4309
buildInfo.dstAS = as;
4279
- buildInfo.instanceData = {};
4310
+ buildInfo.instanceData = {. offset =offsets[ 1 ],. buffer =smart_refctd_ptr<IGPUBuffer>(params. scratchForDeviceASBuild -> getBuffer ()) };
4280
4311
// be based cause vectors can grow
4281
4312
{
4282
4313
const auto offset = trackedBLASes.size ();
4283
4314
using p_p_BLAS_t = const IGPUBottomLevelAccelerationStructure**;
4284
- buildInfo.trackedBLASes = {reinterpret_cast <const p_p_BLAS_t&>(offset),instances. size () };
4315
+ buildInfo.trackedBLASes = {reinterpret_cast <const p_p_BLAS_t&>(offset),instanceCount };
4285
4316
}
4286
- rangeInfos.emplace_back (instances. size () ,0u );
4317
+ rangeInfos.emplace_back (instanceCount ,0u );
4287
4318
}
4288
4319
recordBuilds ();
4289
4320
computeCmdBuf->cmdbuf ->endDebugMarker ();
@@ -4298,7 +4329,8 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
4298
4329
computeCmdBuf->cmdbuf ->writeAccelerationStructureProperties (compactions,IQueryPool::TYPE::ACCELERATION_STRUCTURE_COMPACTED_SIZE,queryPool.get (),0 )
4299
4330
)
4300
4331
{
4301
- // drain compute
4332
+ // submit cause host needs to read the queries
4333
+ drainCompute ();
4302
4334
// get queries
4303
4335
core::vector<size_t > sizes (compactions.size ());
4304
4336
if (device->getQueryPoolResults (
0 commit comments