@@ -3567,6 +3567,12 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
logger.log("Acceleration Structure Scratch Device Memory Allocator not large enough!",system::ILogger::ELL_ERROR);
return retval;
}
+ const auto minScratchAlignment = device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment;
+ if (addrAlloc.max_alignment()<minScratchAlignment)
+ {
+ logger.log("Acceleration Structure Scratch Device Memory Allocator cannot allocate with Physical Device's minimum required AS-build scratch alignment %u",system::ILogger::ELL_ERROR,minScratchAlignment);
+ return retval;
+ }
}
// the elusive and exotic host builds
if (reservations.willHostASBuild() && !params.scratchForHostASBuild)
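Since the new guard above can only reject a mis-configured allocator, the actual fix is on the caller's side: whatever sub-allocator backs `params.scratchForDeviceASBuild` has to be created with a maximum allocatable alignment that covers the device limit. A minimal caller-side sketch, with `createScratchSubAllocator` standing in for the application's own setup code (it is a hypothetical helper, not part of the converter's API):

// illustrative caller-side setup, not part of this commit
const auto& limits = device->getPhysicalDevice()->getLimits();
// the same limit the converter now validates max_alignment() against
const uint32_t minScratchAlignment = limits.minAccelerationStructureScratchOffsetAlignment;
// createScratchSubAllocator is a placeholder; the important part is that the
// sub-allocator's maximum alignment is at least minScratchAlignment
params.scratchForDeviceASBuild = createScratchSubAllocator(device,scratchBufferSize,/*maxAlignment=*/minScratchAlignment);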
@@ -3590,9 +3596,9 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
return const_cast<core::blake3_hash_t*>(&found->second.value);
};
// wipe gpu item in staging cache (this may drop it as well if it was made for only a root asset == no users)
- auto markFailureInStaging = [logger](auto* gpuObj, core::blake3_hash_t* hash)->void
+ auto markFailureInStaging = [logger](const char* message, auto* gpuObj, core::blake3_hash_t* hash)->void
{
- logger.log("Data upload failed for \"%s\"",system::ILogger::ELL_ERROR,gpuObj->getObjectDebugName());
+ logger.log("%s failed for \"%s\"",system::ILogger::ELL_ERROR,message,gpuObj->getObjectDebugName());
// change the content hash on the reverse map to a NoContentHash
*hash = CHashCache::NoContentHash;
};
@@ -3677,7 +3683,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
item.canonical = nullptr;
if (!success)
{
- markFailureInStaging(buffer,pFoundHash);
+ markFailureInStaging("Data Upload",buffer,pFoundHash);
continue;
}
submitsNeeded |= IQueue::FAMILY_FLAGS::TRANSFER_BIT;
@@ -3877,7 +3883,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
}
if (!quickWriteDescriptor(SrcMipBinding,srcIx,std::move(srcView)))
{
- markFailureInStaging(image,pFoundHash);
+ markFailureInStaging("Source Mip Level Descriptor Write",image,pFoundHash);
continue;
}
}
@@ -4185,7 +4191,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
// failed in the for-loop
if (lvl != creationParams.mipLevels)
{
- markFailureInStaging(image,pFoundHash);
+ markFailureInStaging("Compute Mip Mapping",image,pFoundHash);
continue;
}
}
@@ -4194,7 +4200,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
{
if (!pipelineBarrier(xferCmdBuf,{.memBarriers={},.bufBarriers={},.imgBarriers=transferBarriers},"Final Pipeline Barrier recording to Transfer Command Buffer failed"))
{
- markFailureInStaging(image,pFoundHash);
+ markFailureInStaging("Image Data Upload Pipeline Barrier",image,pFoundHash);
continue;
}
submitsNeeded |= IQueue::FAMILY_FLAGS::TRANSFER_BIT;
@@ -4204,7 +4210,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
dsAlloc->multi_deallocate(SrcMipBinding,1,&srcIx,params.compute->getFutureScratchSemaphore());
if (!pipelineBarrier(computeCmdBuf,{.memBarriers={},.bufBarriers={},.imgBarriers=computeBarriers},"Final Pipeline Barrier recording to Compute Command Buffer failed"))
{
- markFailureInStaging(image,pFoundHash);
+ markFailureInStaging("Compute Mip Mapping Pipeline Barrier",image,pFoundHash);
continue;
}
}
@@ -4281,7 +4287,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
//
if (!device->buildAccelerationStructure(dOp.get(),info,range))
{
- markFailureInStaging(gpuObj,pFoundHash);
+ markFailureInStaging("BLAS Build Command Recording",gpuObj,pFoundHash);
continue;
}
}
@@ -4332,7 +4338,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
trackedBLASes.reserve(hlsl::max(tlasCount,blasCount));
core::vector<IGPUTopLevelAccelerationStructure::BuildRangeInfo> rangeInfos;
rangeInfos.reserve(tlasCount);
- auto recordBuilds = [&]()->void
+ auto recordBuildCommands = [&]()->void
{
if (buildInfos.empty())
return;
@@ -4347,7 +4353,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
for (const auto& info : buildInfos)
{
const auto pFoundHash = findInStaging.operator()<ICPUTopLevelAccelerationStructure>(info.dstAS);
- markFailureInStaging(info.dstAS,pFoundHash); // TODO: make messages configurable message
+ markFailureInStaging("TLAS Build Command Recording",info.dstAS,pFoundHash); // TODO: make messages configurable message
}
buildInfos.clear();
rangeInfos.clear();
@@ -4363,49 +4369,118 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
const auto pFoundHash = findInStaging.operator()<ICPUTopLevelAccelerationStructure>(as);
const auto instances = tlasToBuild.canonical->getInstances();
const auto instanceCount = static_cast<uint32_t>(instances.size());
- const auto instanceSize = true ? sizeof(IGPUTopLevelAccelerationStructure::DevicePolymorphicInstance):sizeof(IGPUTopLevelAccelerationStructure::DevicePolymorphicInstance);
- // allocate scratch and build inputs
- addr_t offsets[2] = {scratch_allocator_t::invalid_value,scratch_allocator_t::invalid_value};
+ size_t instanceDataSize = 0;
+ // gather total input size and check dependants exist
+ for (const auto& instance : instances)
{
- const addr_t sizes[2] = {tlasToBuild.scratchSize,instanceSize*instanceCount};
- const addr_t alignments[2] = {limits.minAccelerationStructureScratchOffsetAlignment,8}; // TODO: check address allocator can service these alignments
- const size_t worstSize = core::alignUp(sizes[0],alignments[1])+sizes[1];
- // it will never fit (prevent CPU hangs)
- if (const auto& addrAlloc=params.scratchForDeviceASBuild->getAddressAllocator(); addrAlloc.get_free_size()+addrAlloc.get_allocated_size()<worstSize)
+ if (reservations.m_blasBuildMap.find(instance.getBase().blas.get())==reservations.m_blasBuildMap.end())
{
- markFailureInStaging(as,pFoundHash);
- continue;
+ instanceDataSize = 0;
+ break;
+ }
+ using instance_type_t = ICPUTopLevelAccelerationStructure::INSTANCE_TYPE;
+ switch (instance.getType())
+ {
+ case instance_type_t::SRT_MOTION:
+ instanceDataSize += sizeof(IGPUTopLevelAccelerationStructure::DeviceSRTMotionInstance);
+ break;
+ case instance_type_t::MATRIX_MOTION:
+ instanceDataSize += sizeof(IGPUTopLevelAccelerationStructure::DeviceMatrixMotionInstance);
+ break;
+ default:
+ instanceDataSize += sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance);
+ break;
}
+ }
+ // problem with finding the dependents (BLASes)
+ if (instanceDataSize==0)
+ {
+ markFailureInStaging("Finding Dependent GPU BLASes for TLAS build",as,pFoundHash);
+ continue;
+ }
+ // allocate scratch and build inputs
+ constexpr uint32_t AllocCount = 3;
+ addr_t offsets[3] = {scratch_allocator_t::invalid_value,scratch_allocator_t::invalid_value,scratch_allocator_t::invalid_value};
+ const addr_t sizes[AllocCount] = {tlasToBuild.scratchSize,instanceDataSize,sizeof(void*)*instanceCount};
+ {
+ const addr_t alignments[AllocCount] = {limits.minAccelerationStructureScratchOffsetAlignment,16,8};
+ /* TODO: move to reserve phase - prevent CPU hangs by making sure allocator big enough to service us
+ {
+ addr_t worstSize = sizes[0];
+ for (auto i=1u; i<AllocCount; i++)
+ worstSize = core::alignUp(worstSize,alignments[i])+sizes[i];
+ if (worstSize>minScratchSize)
+ minScratchSize = worstSize;
+ }*/
// if fail then flush and keep trying till space is made
- for (uint32_t t=0; params.scratchForDeviceASBuild->multi_allocate(2u,&offsets[0],&sizes[0],&alignments[0])!=0u; t++)
+ for (uint32_t t=0; params.scratchForDeviceASBuild->multi_allocate(AllocCount,&offsets[0],&sizes[0],&alignments[0])!=0u; t++)
if (t==1) // don't flush right away cause allocator not defragmented yet
{
- recordBuilds();
+ recordBuildCommands();
+ // TODO: make sure compute acquires ownership of geometry data for the build
drainCompute();
}
- params.scratchForDeviceASBuild->multi_deallocate(2,&offsets[0],&sizes[0],params.compute->getFutureScratchSemaphore());
+ // queue up a deferred deallocation
+ params.scratchForDeviceASBuild->multi_deallocate(AllocCount,&offsets[0],&sizes[0],params.compute->getFutureScratchSemaphore());
}
- // stream the instance/geometry input in && check dependents
- // unfortunately can't count on large ReBAR heaps so we can't force the `scratchBuffer` to be mapped and writable
- for (const auto& instance : instances)
+ // stream the instance/geometry input in
+ // unfortunately can't count on large ReBAR heaps so we can't force the `scratchBuffer` to be mapped and writable
+ SBufferRange<IGPUBuffer> range = {.offset=offsets[2],.size=sizes[2],.buffer=smart_refctd_ptr<IGPUBuffer>(params.scratchForDeviceASBuild->getBuffer())};
{
- instance.instance;
+ // TODO: make sure the overflow submit work callback is doing some CPU work
+ // TODO: write the callbacks
+ struct FillInstancePointers
+ {
+ // uint32_t increaseAlignment(const uint32_t original) override {return sizeof(void*);}
+
+ size_t operator()(void* dst, const size_t offsetInRange, const size_t blockSize)
+ {
+ assert(false);
+ return 0ul;
+ }
+ };
+ FillInstancePointers fillInstancePointers;
+ bool success = params.utilities->updateBufferRangeViaStagingBuffer(*params.transfer,range,fillInstancePointers);
+ range.offset = offsets[1];
+ range.size = sizes[1];
+ struct FillInstances
+ {
+ // uint32_t increaseAlignment(const uint32_t original) override {return sizeof(void*);}
+
+ size_t operator()(void* dst, const size_t offsetInRange, const size_t blockSize)
+ {
+ assert(false);
+ return 0ul;
+ }
+ };
+ FillInstances fillInstances;
+ params.utilities->updateBufferRangeViaStagingBuffer(*params.transfer,range,fillInstances);
+ // TODO: pipeline barrier & ownership release between xfer and compute
+ // current recording buffer may have changed
+ xferCmdBuf = params.transfer->getCommandBufferForRecording();
+ if (!success)
+ {
+ markFailureInStaging("Uploading Instance Data for TLAS build",as,pFoundHash);
+ continue;
+ }
}
// prepare build infos
auto& buildInfo = buildInfos.emplace_back();
- buildInfo.scratch = {.offset=offsets[0],.buffer=smart_refctd_ptr<IGPUBuffer>(params.scratchForDeviceASBuild->getBuffer())};
+ buildInfo.scratch = {.offset=range.offset,.buffer=smart_refctd_ptr(range.buffer)};
buildInfo.buildFlags = tlasToBuild.getBuildFlags();
buildInfo.dstAS = as;
- buildInfo.instanceData = {.offset=offsets[1],.buffer=smart_refctd_ptr<IGPUBuffer>(params.scratchForDeviceASBuild->getBuffer())};
+ // note we don't build directly from staging, because only very small inputs could come from there and they'd impede the transfer efficiency of the larger ones
+ buildInfo.instanceData = {.offset=offsets[1],.buffer=smart_refctd_ptr(range.buffer)};
// be based cause vectors can grow
{
const auto offset = trackedBLASes.size();
using p_p_BLAS_t = const IGPUBottomLevelAccelerationStructure**;
buildInfo.trackedBLASes = {reinterpret_cast<const p_p_BLAS_t&>(offset),instanceCount};
}
+ // no special extra byte offset into the instance buffer
rangeInfos.emplace_back(instanceCount,0u);
}
- recordBuilds();
+ recordBuildCommands();
computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Compact TLASes END");
computeCmdBuf->cmdbuf->endDebugMarker();
// no longer need this info
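The two staging-upload callbacks in this hunk are committed as `assert(false)` placeholders (see the `// TODO: write the callbacks` note). For orientation only, here is a sketch of the shape such a producer callback could take, assuming the instance records have already been serialized into a contiguous CPU-side byte array; apart from the `operator()` signature, which is copied from the placeholder, everything below is an assumption rather than this commit's implementation:

#include <algorithm>
#include <cstddef>
#include <cstring>
#include <span>

// Hypothetical fill callback: copies the next window of pre-serialized instance data
// into the staging block handed out by the streaming upload utility.
struct FillInstancesSketch
{
	// write into `dst`, starting `offsetInRange` bytes into the destination range,
	// at most `blockSize` bytes per call
	size_t operator()(void* dst, const size_t offsetInRange, const size_t blockSize) const
	{
		const size_t remaining = serializedInstances.size()-offsetInRange;
		const size_t toWrite = std::min(blockSize,remaining);
		std::memcpy(dst,serializedInstances.data()+offsetInRange,toWrite);
		return toWrite; // bytes produced this call, so the utility knows how far it got
	}

	std::span<const std::byte> serializedInstances; // filled by the caller beforehand
};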
@@ -4666,18 +4741,19 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
auto* pGpuObj = item.first;
if (depsMissing)
{
- const auto* hashAsU64 = reinterpret_cast<const uint64_t*>(item.second.value.data);
- logger.log("GPU Obj %s not writing to final cache because conversion of a dependant failed!", system::ILogger::ELL_ERROR, pGpuObj->getObjectDebugName());
+ logger.log("GPU Obj %s not writing to final cache because conversion of a dependant failed!",system::ILogger::ELL_ERROR,pGpuObj->getObjectDebugName());
// wipe self, to let users know
item.second.value = {};
continue;
}
+ // The BLASes don't need to do this, because no-one checks for them as dependents and we can substitute the `item.first` in the staging cache right away
// For TLASes we need to write the compacted TLAS and not the intermediate build to the Cache
if constexpr (IsTLAS)
{
auto found = compactedTLASMap.find(pGpuObj);
if (found!=compactedTLASMap.end())
pGpuObj = found->second.get();
+
}
// We have success now, but ask callback if we write to the new cache.
if (!params.writeCache(item.second)) // TODO: let the user know the pointer to the GPU Object too?
@@ -4691,10 +4767,8 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
// again, need to go bottom up so we can check dependencies being successes
mergeCache.operator()<ICPUBuffer>();
mergeCache.operator()<ICPUImage>();
- #ifdef NBL_ACCELERATION_STRUCTURE_CONVERSION
mergeCache.operator()<ICPUBottomLevelAccelerationStructure>();
mergeCache.operator()<ICPUTopLevelAccelerationStructure>();
- #endif
mergeCache.operator()<ICPUBufferView>();
mergeCache.operator()<ICPUImageView>();
mergeCache.operator()<ICPUShader>();