@@ -3604,8 +3604,10 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
3604
3604
outputReverseMap[gpuObj.value .get ()] = i++;
3605
3605
}
3606
3606
);
3607
- auto markFailureInStaging = [&reservations,&outputReverseMap,logger]<Asset AssetType>(const char * message, const asset_traits<AssetType>::video_t * gpuObj, core::blake3_hash_t * hash)->void
3607
+ auto markFailureInStaging = [&reservations,&outputReverseMap,logger]<Asset AssetType>(const char * message, smart_refctd_ptr< const AssetType>& canonical, const asset_traits<AssetType>::video_t * gpuObj, core::blake3_hash_t * hash)->void
3608
3608
{
3609
+ // wipe the smart pointer to the canonical, make sure we release that memory ASAP if no other user is around
3610
+ canonical = nullptr ;
3609
3611
logger.log (" %s failed for \" %s\" " ,system::ILogger::ELL_ERROR,message,gpuObj->getObjectDebugName ());
3610
3612
// change the content hash on the reverse map to a NoContentHash
3611
3613
*hash = CHashCache::NoContentHash;
@@ -3695,13 +3697,13 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
3695
3697
success = success && params.utilities ->updateBufferRangeViaStagingBuffer (*params.transfer ,range,item.canonical ->getPointer ());
3696
3698
// current recording buffer may have changed
3697
3699
xferCmdBuf = params.transfer ->getCommandBufferForRecording ();
3698
- // let go of canonical asset (may free RAM)
3699
- item.canonical = nullptr ;
3700
3700
if (!success)
3701
3701
{
3702
- markFailureInStaging. operator ()<ICPUBuffer>( " Data Upload" ,buffer,pFoundHash);
3702
+ markFailureInStaging ( " Data Upload" ,item. canonical ,buffer,pFoundHash);
3703
3703
continue ;
3704
3704
}
3705
+ // let go of canonical asset (may free RAM)
3706
+ item.canonical = nullptr ;
3705
3707
submitsNeeded |= IQueue::FAMILY_FLAGS::TRANSFER_BIT;
3706
3708
// enqueue ownership release if necessary
3707
3709
if (ownerQueueFamily!=IQueue::FamilyIgnored)
@@ -3899,7 +3901,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
3899
3901
}
3900
3902
if (!quickWriteDescriptor (SrcMipBinding,srcIx,std::move (srcView)))
3901
3903
{
3902
- markFailureInStaging. operator ()<ICPUImage>( " Source Mip Level Descriptor Write" ,image,pFoundHash);
3904
+ markFailureInStaging ( " Source Mip Level Descriptor Write" ,item. canonical ,image,pFoundHash);
3903
3905
continue ;
3904
3906
}
3905
3907
}
@@ -4152,7 +4154,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
4152
4154
const auto oldImmediateSubmitSignalValue = params.transfer ->scratchSemaphore .value ;
4153
4155
if (!params.utilities ->updateImageViaStagingBuffer (*params.transfer ,cpuImg->getBuffer ()->getPointer (),cpuImg->getCreationParameters ().format ,image,tmp.newLayout ,regions))
4154
4156
{
4155
- logger.log (" Image Redion Upload failed!" , system::ILogger::ELL_ERROR);
4157
+ logger.log (" Image Region Upload failed!" , system::ILogger::ELL_ERROR);
4156
4158
break ;
4157
4159
}
4158
4160
// stall callback is only called if multiple buffering of scratch commandbuffers fails, we also want to submit compute if transfer was submitted
@@ -4207,16 +4209,18 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
4207
4209
// failed in the for-loop
4208
4210
if (lvl != creationParams.mipLevels )
4209
4211
{
4210
- markFailureInStaging. operator ()<ICPUImage>( " Compute Mip Mapping" ,image,pFoundHash);
4212
+ markFailureInStaging ( " Compute Mip Mapping" ,item. canonical ,image,pFoundHash);
4211
4213
continue ;
4212
4214
}
4215
+ // let go of canonical asset (may free RAM)
4216
+ item.canonical = nullptr ;
4213
4217
}
4214
4218
// here we only record barriers that do final layout transitions and release ownership to final queue family
4215
4219
if (!transferBarriers.empty ())
4216
4220
{
4217
4221
if (!pipelineBarrier (xferCmdBuf,{.memBarriers ={},.bufBarriers ={},.imgBarriers =transferBarriers}," Final Pipeline Barrier recording to Transfer Command Buffer failed" ))
4218
4222
{
4219
- markFailureInStaging. operator ()<ICPUImage>( " Image Data Upload Pipeline Barrier" ,image,pFoundHash);
4223
+ markFailureInStaging ( " Image Data Upload Pipeline Barrier" ,item. canonical ,image,pFoundHash);
4220
4224
continue ;
4221
4225
}
4222
4226
submitsNeeded |= IQueue::FAMILY_FLAGS::TRANSFER_BIT;
@@ -4226,7 +4230,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
4226
4230
dsAlloc->multi_deallocate (SrcMipBinding,1 ,&srcIx,params.compute ->getFutureScratchSemaphore ());
4227
4231
if (!pipelineBarrier (computeCmdBuf,{.memBarriers ={},.bufBarriers ={},.imgBarriers =computeBarriers}," Final Pipeline Barrier recording to Compute Command Buffer failed" ))
4228
4232
{
4229
- markFailureInStaging. operator ()<ICPUImage>( " Compute Mip Mapping Pipeline Barrier" ,image,pFoundHash);
4233
+ markFailureInStaging ( " Compute Mip Mapping Pipeline Barrier" ,item. canonical ,image,pFoundHash);
4230
4234
continue ;
4231
4235
}
4232
4236
}
@@ -4244,8 +4248,6 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
4244
4248
{
4245
4249
// we release BLAS and TLAS Storage Buffer ownership at the same time, because BLASes about to be released may need to be read by TLAS builds
4246
4250
core::vector<buffer_mem_barrier_t > ownershipTransfers;
4247
- // the already compacted BLASes need to be written into the TLASes using them
4248
- core::unordered_map<IGPUBottomLevelAccelerationStructure*,smart_refctd_ptr<IGPUBottomLevelAccelerationStructure>> compactedBLASMap;
4249
4251
4250
4252
// Device Builds
4251
4253
auto & blasesToBuild = reservations.m_blasConversions [0 ];
@@ -4271,7 +4273,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
4271
4273
// Device BLAS builds
4272
4274
if (blasCount)
4273
4275
{
4274
- compactedBLASMap. reserve (blasCount);
4276
+ // build
4275
4277
#ifdef NBL_ACCELERATION_STRUCTURE_CONVERSION
4276
4278
constexpr auto GeometryIsAABBFlag = ICPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT;
4277
4279
@@ -4303,7 +4305,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
4303
4305
//
4304
4306
if (!device->buildAccelerationStructure (dOp.get (),info,range))
4305
4307
{
4306
- markFailureInStaging. operator ()<ICPUBottomLevelAccelerationStructure>( " BLAS Build Command Recording" ,gpuObj,pFoundHash);
4308
+ markFailureInStaging ( " BLAS Build Command Recording" ,item. canonical ,gpuObj,pFoundHash);
4307
4309
continue ;
4308
4310
}
4309
4311
}
@@ -4324,6 +4326,11 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
4324
4326
}
4325
4327
}
4326
4328
#endif
4329
+ // compact
4330
+ {
4331
+ // the already compacted BLASes need to be written into the TLASes using them, want to swap them out ASAP
4332
+ // reservations.m_blasBuildMap[canonical].gpuBLAS = compacted;
4333
+ }
4327
4334
blasesToBuild.clear ();
4328
4335
}
4329
4336
@@ -4350,7 +4357,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
4350
4357
//
4351
4358
core::vector<IGPUTopLevelAccelerationStructure::DeviceBuildInfo> buildInfos;
4352
4359
buildInfos.reserve (tlasCount);
4353
- core::vector<const IGPUBottomLevelAccelerationStructure* > trackedBLASes;
4360
+ core::vector<smart_refctd_ptr< const IGPUBottomLevelAccelerationStructure> > trackedBLASes;
4354
4361
trackedBLASes.reserve (hlsl::max (tlasCount,blasCount));
4355
4362
core::vector<IGPUTopLevelAccelerationStructure::BuildRangeInfo> rangeInfos;
4356
4363
rangeInfos.reserve (tlasCount);
@@ -4362,14 +4369,16 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
4362
4369
for (auto & info : buildInfos)
4363
4370
{
4364
4371
const auto offset = info.trackedBLASes .data ();
4365
- info.trackedBLASes = {trackedBLASes.data ()+reinterpret_cast <const size_t &>(offset),info.trackedBLASes .size ()};
4372
+ const auto correctPtr = trackedBLASes.data ()+reinterpret_cast <const size_t &>(offset);
4373
+ info.trackedBLASes = {reinterpret_cast <const IGPUBottomLevelAccelerationStructure** const &>(correctPtr),info.trackedBLASes .size ()};
4366
4374
}
4367
4375
//
4368
4376
if (!computeCmdBuf->cmdbuf ->buildAccelerationStructures ({buildInfos},rangeInfos.data ()))
4369
4377
for (const auto & info : buildInfos)
4370
4378
{
4371
4379
const auto pFoundHash = findInStaging.operator ()<ICPUTopLevelAccelerationStructure>(info.dstAS );
4372
- markFailureInStaging.operator ()<ICPUTopLevelAccelerationStructure>(" TLAS Build Command Recording" ,info.dstAS ,pFoundHash); // TODO: make messages configurable message
4380
+ smart_refctd_ptr<const ICPUTopLevelAccelerationStructure> dummy; // already null at this point
4381
+ markFailureInStaging (" TLAS Build Command Recording" ,dummy,info.dstAS ,pFoundHash);
4373
4382
}
4374
4383
buildInfos.clear ();
4375
4384
rangeInfos.clear ();
@@ -4379,8 +4388,11 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
4379
4388
using scratch_allocator_t = std::remove_reference_t <decltype (*params.scratchForDeviceASBuild )>;
4380
4389
using addr_t = typename scratch_allocator_t ::size_type;
4381
4390
const auto & limits = physDev->getLimits ();
4382
- for (const auto & tlasToBuild : tlasesToBuild)
4391
+ core::unordered_set<smart_refctd_ptr<const IGPUBottomLevelAccelerationStructure>> dedupBLASesUsed;
4392
+ dedupBLASesUsed.reserve (reservations.m_blasBuildMap .size ());
4393
+ for (auto & tlasToBuild : tlasesToBuild)
4383
4394
{
4395
+ dedupBLASesUsed.clear ();
4384
4396
const auto as = tlasToBuild.gpuObj ;
4385
4397
const auto pFoundHash = findInStaging.operator ()<ICPUTopLevelAccelerationStructure>(as);
4386
4398
const auto instances = tlasToBuild.canonical ->getInstances ();
@@ -4399,7 +4411,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
4399
4411
// problem with finding the dependents (BLASes)
4400
4412
if (instanceDataSize==0 )
4401
4413
{
4402
- markFailureInStaging. operator ()<ICPUTopLevelAccelerationStructure>( " Finding Dependant GPU BLASes for TLAS build" ,as,pFoundHash);
4414
+ markFailureInStaging ( " Finding Dependant GPU BLASes for TLAS build" ,tlasToBuild. canonical ,as,pFoundHash);
4403
4415
continue ;
4404
4416
}
4405
4417
// allocate scratch and build inputs
@@ -4451,19 +4463,26 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
4451
4463
const auto newWritten = bytesWritten+size;
4452
4464
if (newWritten>=blockSize)
4453
4465
return bytesWritten;
4454
- auto blas = blasBuildMap->find (instance.getBase ().blas .get ())->second ;
4466
+ auto found = blasBuildMap->find (instance.getBase ().blas .get ());
4467
+ assert (found!=blasBuildMap->end ()); // blasBuildMap is a pointer (see ->find above); a miss would make the found->second access below invalid
4468
+ const auto & blas = found->second .gpuBLAS ;
4455
4469
dst = IGPUTopLevelAccelerationStructure::writeInstance (dst,instance,blas.get ()->getReferenceForDeviceOperations ());
4470
+ dedupBLASesUsed->emplace (blas);
4471
+ if (--found->second .remainingUsages == 0 )
4472
+ blasBuildMap->erase (found);
4456
4473
bytesWritten = newWritten;
4457
4474
}
4458
4475
}
4459
4476
4477
+ SReserveResult::cpu_to_gpu_blas_map_t * blasBuildMap;
4478
+ core::unordered_set<smart_refctd_ptr<const IGPUBottomLevelAccelerationStructure>>* dedupBLASesUsed;
4460
4479
std::span<const ICPUTopLevelAccelerationStructure::PolymorphicInstance> instances;
4461
- const SReserveResult::cpu_to_gpu_blas_map_t * blasBuildMap;
4462
4480
uint32_t instanceIndex = 0 ;
4463
4481
};
4464
4482
FillInstances fillInstances;
4465
- fillInstances.instances = instances;
4466
4483
fillInstances.blasBuildMap = &reservations.m_blasBuildMap ;
4484
+ fillInstances.dedupBLASesUsed = &dedupBLASesUsed;
4485
+ fillInstances.instances = instances;
4467
4486
success = success && params.utilities ->updateBufferRangeViaStagingBuffer (*params.transfer ,range,fillInstances);
4468
4487
}
4469
4488
if (as->usesMotion ())
@@ -4501,9 +4520,11 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
4501
4520
xferCmdBuf = params.transfer ->getCommandBufferForRecording ();
4502
4521
if (!success)
4503
4522
{
4504
- markFailureInStaging. operator ()<ICPUTopLevelAccelerationStructure>( " Uploading Instance Data for TLAS build failed" ,as,pFoundHash);
4523
+ markFailureInStaging ( " Uploading Instance Data for TLAS build failed" ,tlasToBuild. canonical ,as,pFoundHash);
4505
4524
continue ;
4506
4525
}
4526
+ // let go of canonical asset (may free RAM)
4527
+ tlasToBuild.canonical = nullptr ;
4507
4528
}
4508
4529
// prepare build infos
4509
4530
auto & buildInfo = buildInfos.emplace_back ();
@@ -4517,16 +4538,18 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
4517
4538
{
4518
4539
const auto offset = trackedBLASes.size ();
4519
4540
using p_p_BLAS_t = const IGPUBottomLevelAccelerationStructure**;
4520
- buildInfo.trackedBLASes = {reinterpret_cast <const p_p_BLAS_t&>(offset),instanceCount};
4541
+ buildInfo.trackedBLASes = {reinterpret_cast <const p_p_BLAS_t&>(offset),dedupBLASesUsed.size ()};
4542
+ for (auto & blas : dedupBLASesUsed)
4543
+ trackedBLASes.emplace_back (std::move (blas));
4544
+
4521
4545
}
4522
4546
// no special extra byte offset into the instance buffer
4523
4547
rangeInfos.emplace_back (instanceCount,0u );
4524
4548
}
4549
+ // finish the last batch
4525
4550
recordBuildCommands ();
4526
4551
computeCmdBuf->cmdbuf ->beginDebugMarker (" Asset Converter Compact TLASes END" );
4527
4552
computeCmdBuf->cmdbuf ->endDebugMarker ();
4528
- // no longer need this info
4529
- compactedBLASMap.clear ();
4530
4553
}
4531
4554
// compact
4532
4555
computeCmdBuf->cmdbuf ->beginDebugMarker (" Asset Converter Compact TLASes START" );
0 commit comments