@@ -4378,19 +4378,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
                 instanceDataSize = 0;
                 break;
         }
-        using instance_type_t = ICPUTopLevelAccelerationStructure::INSTANCE_TYPE;
-        switch (instance.getType())
-        {
-            case instance_type_t::SRT_MOTION:
-                instanceDataSize += sizeof(IGPUTopLevelAccelerationStructure::DeviceSRTMotionInstance);
-                break;
-            case instance_type_t::MATRIX_MOTION:
-                instanceDataSize += sizeof(IGPUTopLevelAccelerationStructure::DeviceMatrixMotionInstance);
-                break;
-            default:
-                instanceDataSize += sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance);
-                break;
-        }
+        instanceDataSize += ITopLevelAccelerationStructure::getInstanceSize(instance.getType());
     }
     // problem with finding the dependents (BLASes)
     if (instanceDataSize==0)
@@ -4399,11 +4387,11 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
         continue;
     }
     // allocate scratch and build inputs
-    constexpr uint32_t AllocCount = 3;
-    addr_t offsets[3] = {scratch_allocator_t::invalid_value,scratch_allocator_t::invalid_value,scratch_allocator_t::invalid_value};
-    const addr_t sizes[AllocCount] = {tlasToBuild.scratchSize,instanceDataSize,sizeof(void*)*instanceCount};
+    constexpr uint32_t MaxAllocCount = 3;
+    addr_t offsets[MaxAllocCount] = {scratch_allocator_t::invalid_value,scratch_allocator_t::invalid_value,scratch_allocator_t::invalid_value};
+    const addr_t sizes[MaxAllocCount] = {tlasToBuild.scratchSize,instanceDataSize,sizeof(void*)*instanceCount};
     {
-        const addr_t alignments[AllocCount] = {limits.minAccelerationStructureScratchOffsetAlignment,16,8};
+        const addr_t alignments[MaxAllocCount] = {limits.minAccelerationStructureScratchOffsetAlignment,16,8};
         /* TODO: move to reserve phase - prevent CPU hangs by making sure allocator big enough to service us
         {
             addr_t worstSize = sizes[0];
@@ -4412,6 +4400,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
             if (worstSize>minScratchSize)
                 minScratchSize = worstSize;
         }*/
+        const auto AllocCount = as->usesMotion() ? 3:2; // the instance-pointer array (third allocation) is only needed for motion builds
        // if fail then flush and keep trying till space is made
        for (uint32_t t=0; params.scratchForDeviceASBuild->multi_allocate(AllocCount,&offsets[0],&sizes[0],&alignments[0])!=0u; t++)
            if (t==1) // don't flush right away cause allocator not defragmented yet
@@ -4425,36 +4414,72 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
     }
     // stream the instance/geometry input in
     // unfortunately can't count on large ReBAR heaps so we can't force the `scratchBuffer` to be mapped and writable
-    SBufferRange<IGPUBuffer> range = {.offset=offsets[2],.size=sizes[2],.buffer=smart_refctd_ptr<IGPUBuffer>(params.scratchForDeviceASBuild->getBuffer())};
+    SBufferRange<IGPUBuffer> range = {.offset=offsets[1],.size=sizes[1],.buffer=smart_refctd_ptr<IGPUBuffer>(params.scratchForDeviceASBuild->getBuffer())};
     {
+        bool success = true;
         // TODO: make sure the overflow submit work callback is doing some CPU work
-        // TODO: write the callbacks
-        struct FillInstancePointers
         {
-            // uint32_t increaseAlignment(const uint32_t original) override {return sizeof(void*);}
-
-            size_t operator()(void* dst, const size_t offsetInRange, const size_t blockSize)
+            struct FillInstances : IUtilities::IUpstreamingDataProducer
             {
-                assert(false);
-                return 0ul;
-            }
-        };
-        FillInstancePointers fillInstancePointers;
-        bool success = params.utilities->updateBufferRangeViaStagingBuffer(*params.transfer,range,fillInstancePointers);
-        range.offset = offsets[1];
-        range.size = sizes[1];
-        struct FillInstances
-        {
-            // uint32_t increaseAlignment(const uint32_t original) override {return sizeof(void*);}
+                uint32_t operator()(void* dst, const size_t offsetInRange, const uint32_t blockSize) override
+                {
+                    assert(offsetInRange%16==0);
+                    uint32_t bytesWritten = 0;
+                    while (instanceIndex<instances.size())
+                    {
+                        const auto& instance = instances[instanceIndex];
+                        const auto type = instance.getType();
+                        const auto size = ITopLevelAccelerationStructure::getInstanceSize(type);
+                        const auto newWritten = bytesWritten+size;
+                        if (newWritten>blockSize) // doesn't fit, the instance gets written into the next staging block instead
+                            break;
+                        auto blas = blasBuildMap->find(instance.getBase().blas.get())->second;
+                        dst = IGPUTopLevelAccelerationStructure::writeInstance(dst,instance,blas.get()->getReferenceForDeviceOperations());
+                        bytesWritten = newWritten;
+                        instanceIndex++;
+                    }
+                    return bytesWritten;
+                }

-            size_t operator()(void* dst, const size_t offsetInRange, const size_t blockSize)
+                std::span<const ICPUTopLevelAccelerationStructure::PolymorphicInstance> instances;
+                const SReserveResult::cpu_to_gpu_blas_map_t* blasBuildMap;
+                uint32_t instanceIndex = 0;
+            };
+            FillInstances fillInstances;
+            fillInstances.instances = instances;
+            fillInstances.blasBuildMap = &reservations.m_blasBuildMap;
+            success = success && params.utilities->updateBufferRangeViaStagingBuffer(*params.transfer,range,fillInstances);
+        }
+        if (as->usesMotion())
+        {
+            range.offset = offsets[2];
+            range.size = sizes[2];
+            struct FillInstancePointers : IUtilities::IUpstreamingDataProducer
             {
-                assert(false);
-                return 0ul;
-            }
-        };
-        FillInstances fillInstances;
-        params.utilities->updateBufferRangeViaStagingBuffer(*params.transfer,range,fillInstances);
+                uint32_t operator()(void* dst, const size_t offsetInRange, const uint32_t blockSize) override
+                {
+                    constexpr uint32_t ptr_sz = sizeof(uint64_t);
+
+                    const uint32_t count = blockSize/ptr_sz;
+                    assert(offsetInRange%ptr_sz==0);
+                    const uint32_t baseInstance = static_cast<uint32_t>(offsetInRange)/ptr_sz;
+                    for (uint32_t i=0; i<count; i++)
+                    {
+                        const auto type = instances[baseInstance+i].getType();
+                        reinterpret_cast<uint64_t*>(dst)[i] = IGPUTopLevelAccelerationStructure::encodeTypeInAddress(type,instanceAddress);
+                        instanceAddress += ITopLevelAccelerationStructure::getInstanceSize(type);
+                    }
+                    return count*ptr_sz;
+                }
+
+                std::span<const ICPUTopLevelAccelerationStructure::PolymorphicInstance> instances;
+                uint64_t instanceAddress;
+            };
+            FillInstancePointers fillInstancePointers;
+            fillInstancePointers.instances = instances;
+            fillInstancePointers.instanceAddress = range.buffer->getDeviceAddress()+offsets[1];
+            success = success && params.utilities->updateBufferRangeViaStagingBuffer(*params.transfer,range,fillInstancePointers);
+        }
         // TODO: pipeline barrier & ownership release between xfer and compute
         // current recording buffer may have changed
         xferCmdBuf = params.transfer->getCommandBufferForRecording();
@@ -4466,11 +4491,12 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
     }
     // prepare build infos
     auto& buildInfo = buildInfos.emplace_back();
-    buildInfo.scratch = {.offset=range.offset,.buffer=smart_refctd_ptr(range.buffer)};
+    buildInfo.scratch = {.offset=offsets[0],.buffer=smart_refctd_ptr(range.buffer)};
     buildInfo.buildFlags = tlasToBuild.getBuildFlags();
+    buildInfo.instanceDataTypeEncodedInPointersLSB = as->usesMotion();
     buildInfo.dstAS = as;
     // note we don't build directly from staging, because only very small inputs could come from there and they'd impede the transfer efficiency of the larger ones
-    buildInfo.instanceData = {.offset=offsets[1],.buffer=smart_refctd_ptr(range.buffer)};
+    buildInfo.instanceData = {.offset=offsets[as->usesMotion() ? 2:1],.buffer=smart_refctd_ptr(range.buffer)};
     // be based cause vectors can grow
     {
         const auto offset = trackedBLASes.size();
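A note on the allocation loop in the third hunk: `multi_allocate` is retried in a loop, and flushing only starts after the first failure because freed scratch blocks may simply not have been defragmented yet. The hunk is cut off before the flush itself, so what follows is only a self-contained sketch of that allocate-or-flush pattern, with hypothetical `tryAllocate`/`flushPendingWork` callbacks standing in for the suballocator and the overflow-submit logic:

    #include <cstdint>
    #include <functional>

    // Generalization of the diff's `for (uint32_t t=0; multi_allocate(...)!=0u; t++)`
    // loop. Both callbacks are stand-ins: `tryAllocate` returns true on success,
    // `flushPendingWork` submits already-recorded GPU work so the scratch it
    // still references can be recycled by the allocator.
    inline void allocateWithRetry(const std::function<bool()>& tryAllocate, const std::function<void()>& flushPendingWork)
    {
        for (uint32_t t=0; !tryAllocate(); t++)
        {
            // skip flushing on the very first failure: the allocator may just
            // not be defragmented yet, and a plain retry is cheaper than a submit
            if (t>=1)
                flushPendingWork();
        }
    }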
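For context on the two producers the diff introduces: `updateBufferRangeViaStagingBuffer` drains the range through a staging buffer, handing the producer one mapped block at a time; the callback returns how many bytes it actually wrote, and anything short of the block is offered again later. That is why `FillInstances` keeps `instanceIndex` as member state and refuses to start an instance that won't fit in the current block. Below is a minimal self-contained model of that contract; the `Record` type and `streamRange` driver are illustrative stand-ins, not Nabla's API:

    #include <algorithm>
    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <cstring>
    #include <span>

    // Hypothetical variable-size record, standing in for a polymorphic TLAS instance.
    struct Record
    {
        uint32_t size;            // bytes this record occupies in the upload
        const std::byte* payload; // the record's bytes
    };

    // Models the IUpstreamingDataProducer contract: called once per staging
    // block, returns how many bytes it actually wrote into `dst`.
    struct ChunkedProducer
    {
        uint32_t operator()(void* dst, const size_t /*offsetInRange*/, const uint32_t blockSize)
        {
            uint32_t bytesWritten = 0;
            // the cursor is member state, so a record that didn't fit is retried
            // at the start of the next block instead of being split or skipped
            while (recordIndex<records.size())
            {
                const auto& r = records[recordIndex];
                if (bytesWritten+r.size>blockSize)
                    break; // never split a record across two staging blocks
                std::memcpy(static_cast<std::byte*>(dst)+bytesWritten,r.payload,r.size);
                bytesWritten += r.size;
                recordIndex++;
            }
            return bytesWritten;
        }

        std::span<const Record> records;
        size_t recordIndex = 0;
    };

    // Stand-in for the utility's calling pattern: keep offering blocks until the
    // whole range is produced (assumes every record fits within a single block).
    inline void streamRange(ChunkedProducer& producer, std::byte* upload, const size_t rangeSize, const uint32_t blockSize)
    {
        size_t offset = 0;
        while (offset<rangeSize)
        {
            const auto block = static_cast<uint32_t>(std::min<size_t>(blockSize,rangeSize-offset));
            offset += producer(upload+offset,offset,block);
        }
    }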
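`FillInstancePointers` relies on `encodeTypeInAddress`, which packs an instance's type into the low bits of its device address. This works because the instance records are written at least 16-byte aligned (note the `offsetInRange%16==0` assert above), so the bottom bits of every address are guaranteed zero, and `buildInfo.instanceDataTypeEncodedInPointersLSB = as->usesMotion()` tells the build to expect such tagged pointers. Below is a sketch of the general pointer-tagging idea only; the enum values and mask are illustrative, not Nabla's exact bit layout:

    #include <cassert>
    #include <cstdint>

    // Illustrative stand-ins: the real type codes and encoding live in Nabla's
    // ITopLevelAccelerationStructure / IGPUTopLevelAccelerationStructure.
    enum class InstanceType : uint64_t { Static=0, MatrixMotion=1, SRTMotion=2 };

    // 16-byte alignment guarantees the low 4 bits of any instance address are
    // zero, leaving them free to carry a tag
    constexpr uint64_t TypeMask = 0xfull;

    inline uint64_t encodeTypeInAddress(const InstanceType type, const uint64_t deviceAddress)
    {
        assert((deviceAddress&TypeMask)==0ull); // a misaligned address would be corrupted by the tag
        return deviceAddress|static_cast<uint64_t>(type);
    }

    inline InstanceType decodeType(const uint64_t tagged) {return static_cast<InstanceType>(tagged&TypeMask);}
    inline uint64_t decodeAddress(const uint64_t tagged) {return tagged&~TypeMask;}

The consumer of the tagged pointer array can recover both the real address and the record's layout (and therefore its size) from each entry, which is also why `FillInstancePointers` advances `instanceAddress` by `getInstanceSize(type)` per written pointer.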