Commit b3b2987

Authored and committed by devsh
add IGPUTopLevelAccelerationStructure::convertInstance utility method and finish the Instance Data streaming in the Asset Converter
Also only use `instanceDataTypeEncodedInPointersLSB` when needed.
Parent: 7c35d7e
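The new overload set is easiest to see end-to-end. Below is a minimal usage sketch: `cpuTLAS`, `gpuBlasMap` and the mapped `staging` pointer are hypothetical, and `getInstances()` is assumed to return the CPU instance list — none of this is code from the commit itself.

// Hypothetical sketch: pack CPU TLAS instances into a mapped staging allocation
// using the new writeInstance/convertInstance utilities.
using blas_ref_t = IGPUBottomLevelAccelerationStructure::device_op_ref_t;
uint8_t* out = staging; // assumed mapped, 16-byte aligned destination
for (const auto& cpuInstance : cpuTLAS->getInstances())
{
	// look up the GPU BLAS that was built for this CPU BLAS dependency
	const blas_ref_t blasRef = gpuBlasMap[cpuInstance.getBase().blas.get()]->getReferenceForDeviceOperations();
	// writes a Static/MatrixMotion/SRTMotion struct of the right size, returns one past its end
	out = IGPUTopLevelAccelerationStructure::writeInstance(out,cpuInstance,blasRef);
}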

4 files changed (+152, −64)

include/nbl/asset/IAccelerationStructure.h (2 additions, 1 deletion)

@@ -183,9 +183,10 @@ class ITopLevelAccelerationStructure : public IDescriptor, public IAccelerationS
 			FORCE_DISABLE_OPACITY_MICROMAPS_BIT = 0x1u<<5u,
 		};
 		// Note: `core::matrix3x4SIMD` is equvalent to VkTransformMatrixKHR, 4x3 row_major matrix
-		template<typename blas_ref_t>
+		template<typename _blas_ref_t>
 		struct Instance final
 		{
+			using blas_ref_t = _blas_ref_t;
 			static_assert(sizeof(blas_ref_t)==8 && alignof(blas_ref_t)==8);
 			static_assert(std::is_same_v<core::smart_refctd_ptr<ICPUBottomLevelAccelerationStructure>,blas_ref_t> || std::is_standard_layout_v<blas_ref_t>);

include/nbl/video/IGPUAccelerationStructure.h (79 additions, 19 deletions)

@@ -497,34 +497,75 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr
 		using DeviceBuildInfo = BuildInfo<IGPUBuffer>;
 		using HostBuildInfo = BuildInfo<asset::ICPUBuffer>;
 
-		static inline auto encodeTypeInRef(const INSTANCE_TYPE type, IGPUBottomLevelAccelerationStructure::device_op_ref_t ref)
+		template<typename blas_ref_t>
+		static inline Instance<blas_ref_t> convertInstance(const asset::ICPUTopLevelAccelerationStructure::Instance& instance, const blas_ref_t blasRef)
 		{
-			// aligned to 16 bytes as per the spec
-			assert(ref.deviceAddress%16==0);
-			switch (type)
-			{
-				case INSTANCE_TYPE::SRT_MOTION:
-					ref.deviceAddress += 2;
-					break;
-				case INSTANCE_TYPE::MATRIX_MOTION:
-					ref.deviceAddress += 1;
-					break;
-				default:
-					break;
-			}
-			return ref;
+			Instance<blas_ref_t> retval = {
+				.instanceCustomIndex = instance.instanceCustomIndex,
+				.mask = instance.mask,
+				.instanceShaderBindingTableRecordOffset = instance.instanceShaderBindingTableRecordOffset,
+				.flags = instance.flags,
+				.blas = blasRef
+			};
+			return retval;
+		}
+		template<typename blas_ref_t>
+		static inline Instance<blas_ref_t> convertInstance(const asset::ICPUTopLevelAccelerationStructure::Instance& instance, const IGPUBottomLevelAccelerationStructure* gpuBLAS)
+		{
+			assert(gpuBLAS);
+			if constexpr (std::is_same_v<blas_ref_t,IGPUBottomLevelAccelerationStructure::host_op_ref_t>)
+				return convertInstance<blas_ref_t>(instance,gpuBLAS->getReferenceForHostOperations());
+			else
+				return convertInstance<blas_ref_t>(instance,gpuBLAS->getReferenceForDeviceOperations());
+		}
+		template<typename blas_ref_t, typename BLASRefOrPtr>
+		static inline StaticInstance<blas_ref_t> convertInstance(const asset::ICPUTopLevelAccelerationStructure::StaticInstance& instance, const BLASRefOrPtr gpuBLAS)
+		{
+			return {.transform=instance.transform,.base=convertInstance<blas_ref_t>(instance.base,gpuBLAS)};
 		}
-		static inline auto encodeTypeInRef(const INSTANCE_TYPE type, IGPUBottomLevelAccelerationStructure::host_op_ref_t ref)
+		template<typename blas_ref_t, typename BLASRefOrPtr>
+		static inline MatrixMotionInstance<blas_ref_t> convertInstance(const asset::ICPUTopLevelAccelerationStructure::MatrixMotionInstance& instance, const BLASRefOrPtr gpuBLAS)
+		{
+			MatrixMotionInstance<blas_ref_t> retval;
+			std::copy_n(instance.transform,2,retval.transform);
+			retval.base = convertInstance<blas_ref_t>(instance.base,gpuBLAS);
+			return retval;
+		}
+		template<typename blas_ref_t, typename BLASRefOrPtr>
+		static inline SRTMotionInstance<blas_ref_t> convertInstance(const asset::ICPUTopLevelAccelerationStructure::SRTMotionInstance& instance, const BLASRefOrPtr gpuBLAS)
+		{
+			SRTMotionInstance<blas_ref_t> retval;
+			std::copy_n(instance.transform,2,retval.transform);
+			retval.base = convertInstance<blas_ref_t>(instance.base,gpuBLAS);
+			return retval;
+		}
+
+		// returns the pointer to one byte past the address written
+		template<typename blas_ref_t>
+		static inline uint8_t* writeInstance(void* dst, const asset::ICPUTopLevelAccelerationStructure::PolymorphicInstance& instance, const blas_ref_t blasRef)
+		{
+			const uint32_t size = std::visit([&](auto& typedInstance)->size_t
+				{
+					const auto gpuInstance = IGPUTopLevelAccelerationStructure::convertInstance<blas_ref_t,blas_ref_t>(typedInstance,blasRef);
+					memcpy(dst,&gpuInstance,sizeof(gpuInstance));
+					return sizeof(gpuInstance);
+				},
+				instance.instance
+			);
+			return reinterpret_cast<uint8_t*>(dst)+size;
+		}
+		// for when you use an array of pointers to instance structs during a build
+		static inline auto encodeTypeInAddress(const INSTANCE_TYPE type, uint64_t ref)
 		{
 			// aligned to 16 bytes as per the spec
-			assert(ref.apiHandle%16==0);
+			assert(ref%16==0);
 			switch (type)
 			{
 				case INSTANCE_TYPE::SRT_MOTION:
-					ref.apiHandle += 2;
+					ref += 2;
 					break;
 				case INSTANCE_TYPE::MATRIX_MOTION:
-					ref.apiHandle += 1;
+					ref += 1;
 					break;
 				default:
 					break;
@@ -601,6 +642,25 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr
 		using DevicePolymorphicInstance = PolymorphicInstance<IGPUBottomLevelAccelerationStructure::device_op_ref_t>;
 		using HostPolymorphicInstance = PolymorphicInstance<IGPUBottomLevelAccelerationStructure::host_op_ref_t>;
 		static_assert(sizeof(DevicePolymorphicInstance)==sizeof(HostPolymorphicInstance));
+
+		template<typename blas_ref_t, typename BLASRefOrPtr>
+		static inline PolymorphicInstance<blas_ref_t> convertInstance(const asset::ICPUTopLevelAccelerationStructure::PolymorphicInstance& instance, const BLASRefOrPtr gpuBLAS)
+		{
+			PolymorphicInstance<blas_ref_t> retval;
+			switch (instance.getType())
+			{
+				case INSTANCE_TYPE::SRT_MOTION:
+					retval = convertInstance<blas_ref_t>(std::get<asset::ICPUTopLevelAccelerationStructure::SRTMotionInstance>(instance.instance),gpuBLAS);
+					break;
+				case INSTANCE_TYPE::MATRIX_MOTION:
+					retval = convertInstance<blas_ref_t>(std::get<asset::ICPUTopLevelAccelerationStructure::MatrixMotionInstance>(instance.instance),gpuBLAS);
+					break;
+				default:
+					retval = convertInstance<blas_ref_t>(std::get<asset::ICPUTopLevelAccelerationStructure::StaticInstance>(instance.instance),gpuBLAS);
+					break;
+			}
+			return retval;
+		}
 
 		//
 		using build_ver_t = uint32_t;
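Since the spec requires instance data to be 16-byte aligned, the low bits of every pointer are guaranteed zero, which is what lets `encodeTypeInAddress` smuggle the instance type into them (+1 for matrix motion, +2 for SRT motion, static left as 0). A sketch of the inverse, assuming that same convention — the `STATIC` enumerator name is assumed, and this decoder is illustrative, not part of the header:

// Illustrative decoder for the convention above; only SRT_MOTION and
// MATRIX_MOTION appear in the diff, INSTANCE_TYPE::STATIC is an assumed name.
static inline INSTANCE_TYPE decodeTypeFromAddress(const uint64_t encoded, uint64_t& outAddress)
{
	outAddress = encoded & ~0xfull; // strip the tag, restoring the 16-byte aligned address
	switch (encoded & 0xfull)
	{
		case 2: return INSTANCE_TYPE::SRT_MOTION;
		case 1: return INSTANCE_TYPE::MATRIX_MOTION;
		default: return INSTANCE_TYPE::STATIC;
	}
}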

include/nbl/video/utilities/CAssetConverter.h (2 additions, 1 deletion)

@@ -1094,7 +1094,8 @@ class CAssetConverter : public core::IReferenceCounted
 			// We do all compactions on the Device for simplicity
 			uint8_t m_willCompactSomeAS : 1 = false;
 			// This tracks non-root BLASes which are needed for a subsequent TLAS build. Note that even things which are NOT in the staging cache are tracked here to make sure they don't finish their lifetimes early.
-			core::unordered_map<const asset::ICPUBottomLevelAccelerationStructure*,asset_cached_t<asset::ICPUBottomLevelAccelerationStructure>> m_blasBuildMap;
+			using cpu_to_gpu_blas_map_t = core::unordered_map<const asset::ICPUBottomLevelAccelerationStructure*,asset_cached_t<asset::ICPUBottomLevelAccelerationStructure>>;
+			cpu_to_gpu_blas_map_t m_blasBuildMap;
 
 			//
 			core::bitflag<IQueue::FAMILY_FLAGS> m_queueFlags = IQueue::FAMILY_FLAGS::NONE;
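The change here is just giving the existing map type a name; the point (an inference on my part, not stated in the commit) is that the streaming callback added in CAssetConverter.cpp below needs to declare a member pointer to this type from outside the struct, which the alias makes possible:

// as held by the FillInstances producer in the .cpp diff below
const SReserveResult::cpu_to_gpu_blas_map_t* blasBuildMap;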

src/nbl/video/utilities/CAssetConverter.cpp (69 additions, 43 deletions)

@@ -4378,19 +4378,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
 					instanceDataSize = 0;
 					break;
 				}
-				using instance_type_t = ICPUTopLevelAccelerationStructure::INSTANCE_TYPE;
-				switch (instance.getType())
-				{
-					case instance_type_t::SRT_MOTION:
-						instanceDataSize += sizeof(IGPUTopLevelAccelerationStructure::DeviceSRTMotionInstance);
-						break;
-					case instance_type_t::MATRIX_MOTION:
-						instanceDataSize += sizeof(IGPUTopLevelAccelerationStructure::DeviceMatrixMotionInstance);
-						break;
-					default:
-						instanceDataSize += sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance);
-						break;
-				}
+				instanceDataSize += ITopLevelAccelerationStructure::getInstanceSize(instance.getType());
 			}
 			// problem with finding the dependents (BLASes)
 			if (instanceDataSize==0)
@@ -4399,11 +4387,11 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
 				continue;
 			}
 			// allocate scratch and build inputs
-			constexpr uint32_t AllocCount = 3;
-			addr_t offsets[3] = {scratch_allocator_t::invalid_value,scratch_allocator_t::invalid_value,scratch_allocator_t::invalid_value};
-			const addr_t sizes[AllocCount] = {tlasToBuild.scratchSize,instanceDataSize,sizeof(void*)*instanceCount};
+			constexpr uint32_t MaxAllocCount = 3;
+			addr_t offsets[MaxAllocCount] = {scratch_allocator_t::invalid_value,scratch_allocator_t::invalid_value,scratch_allocator_t::invalid_value};
+			const addr_t sizes[MaxAllocCount] = {tlasToBuild.scratchSize,instanceDataSize,sizeof(void*)*instanceCount};
 			{
-				const addr_t alignments[AllocCount] = {limits.minAccelerationStructureScratchOffsetAlignment,16,8};
+				const addr_t alignments[MaxAllocCount] = {limits.minAccelerationStructureScratchOffsetAlignment,16,8};
 				/* TODO: move to reserve phase - prevent CPU hangs by making sure allocator big enough to service us
 				{
 					addr_t worstSize = sizes[0];
@@ -4412,6 +4400,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
 					if (worstSize>minScratchSize)
 						minScratchSize = worstSize;
 				}*/
+				const auto AllocCount = as->usesMotion() ? 3:2;
 				// if fail then flush and keep trying till space is made
 				for (uint32_t t=0; params.scratchForDeviceASBuild->multi_allocate(AllocCount,&offsets[0],&sizes[0],&alignments[0])!=0u; t++)
 				if (t==1) // don't flush right away cause allocator not defragmented yet
@@ -4425,36 +4414,72 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
 			}
 			// stream the instance/geometry input in
 			// unfortunately can't count on large ReBAR heaps so we can't force the `scratchBuffer` to be mapped and writable
-			SBufferRange<IGPUBuffer> range = {.offset=offsets[2],.size=sizes[2],.buffer=smart_refctd_ptr<IGPUBuffer>(params.scratchForDeviceASBuild->getBuffer())};
+			SBufferRange<IGPUBuffer> range = {.offset=offsets[1],.size=sizes[1],.buffer=smart_refctd_ptr<IGPUBuffer>(params.scratchForDeviceASBuild->getBuffer())};
 			{
+				bool success = true;
 				// TODO: make sure the overflow submit work callback is doing some CPU work
-				// TODO: write the callbacks
-				struct FillInstancePointers
 				{
-					// uint32_t increaseAlignment(const uint32_t original) override {return sizeof(void*);}
-
-					size_t operator()(void* dst, const size_t offsetInRange, const size_t blockSize)
+					struct FillInstances : IUtilities::IUpstreamingDataProducer
 					{
-						assert(false);
-						return 0ul;
-					}
-				};
-				FillInstancePointers fillInstancePointers;
-				bool success = params.utilities->updateBufferRangeViaStagingBuffer(*params.transfer,range,fillInstancePointers);
-				range.offset = offsets[1];
-				range.size = sizes[1];
-				struct FillInstances
-				{
-					// uint32_t increaseAlignment(const uint32_t original) override {return sizeof(void*);}
+						uint32_t operator()(void* dst, const size_t offsetInRange, const uint32_t blockSize) override
+						{
+							using blas_ref_t = IGPUBottomLevelAccelerationStructure::device_op_ref_t;
+							assert(offsetInRange%16==0);
+
+							uint32_t bytesWritten = 0;
+							while (instanceIndex<instances.size())
+							{
+								const auto& instance = instances[instanceIndex];
+								const auto type = instance.getType();
+								const auto size = ITopLevelAccelerationStructure::getInstanceSize(type);
+								const auto newWritten = bytesWritten+size;
+								// only write whole instances, they cannot straddle block boundaries
+								if (newWritten>blockSize)
+									break;
+								auto blas = blasBuildMap->find(instance.getBase().blas.get())->second;
+								dst = IGPUTopLevelAccelerationStructure::writeInstance(dst,instance,blas.get()->getReferenceForDeviceOperations());
+								bytesWritten = newWritten;
+								instanceIndex++;
+							}
+							return bytesWritten;
+						}
 
-					size_t operator()(void* dst, const size_t offsetInRange, const size_t blockSize)
+						std::span<const ICPUTopLevelAccelerationStructure::PolymorphicInstance> instances;
+						const SReserveResult::cpu_to_gpu_blas_map_t* blasBuildMap;
+						uint32_t instanceIndex = 0;
+					};
+					FillInstances fillInstances;
+					fillInstances.instances = instances;
+					fillInstances.blasBuildMap = &reservations.m_blasBuildMap;
+					success = success && params.utilities->updateBufferRangeViaStagingBuffer(*params.transfer,range,fillInstances);
+				}
+				if (as->usesMotion())
+				{
+					range.offset = offsets[2];
+					range.size = sizes[2];
+					struct FillInstancePointers : IUtilities::IUpstreamingDataProducer
 					{
-						assert(false);
-						return 0ul;
-					}
-				};
-				FillInstances fillInstances;
-				params.utilities->updateBufferRangeViaStagingBuffer(*params.transfer,range,fillInstances);
+						uint32_t operator()(void* dst, const size_t offsetInRange, const uint32_t blockSize) override
+						{
+							constexpr uint32_t ptr_sz = sizeof(uint64_t);
+
+							const uint32_t count = blockSize/ptr_sz;
+							assert(offsetInRange%ptr_sz==0);
+							const uint32_t baseInstance = static_cast<uint32_t>(offsetInRange)/ptr_sz;
+							for (uint32_t i=0; i<count; i++)
+							{
+								const auto type = instances[baseInstance+i].getType();
+								reinterpret_cast<uint64_t*>(dst)[i] = IGPUTopLevelAccelerationStructure::encodeTypeInAddress(type,instanceAddress);
+								instanceAddress += ITopLevelAccelerationStructure::getInstanceSize(type);
+							}
+							return count*ptr_sz;
+						}
+
+						std::span<const ICPUTopLevelAccelerationStructure::PolymorphicInstance> instances;
+						uint64_t instanceAddress;
+					};
+					FillInstancePointers fillInstancePointers;
+					fillInstancePointers.instances = instances;
+					fillInstancePointers.instanceAddress = range.buffer->getDeviceAddress()+offsets[1];
+					success = success && params.utilities->updateBufferRangeViaStagingBuffer(*params.transfer,range,fillInstancePointers);
+				}
 				// TODO: pipeline barrier & ownership release between xfer and compute
 				// current recording buffer may have changed
 				xferCmdBuf = params.transfer->getCommandBufferForRecording();
@@ -4466,11 +4491,12 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
 			}
 			// prepare build infos
 			auto& buildInfo = buildInfos.emplace_back();
-			buildInfo.scratch = {.offset=range.offset,.buffer=smart_refctd_ptr(range.buffer)};
+			buildInfo.scratch = {.offset=offsets[0],.buffer=smart_refctd_ptr(range.buffer)};
 			buildInfo.buildFlags = tlasToBuild.getBuildFlags();
+			buildInfo.instanceDataTypeEncodedInPointersLSB = as->usesMotion();
 			buildInfo.dstAS = as;
 			// note we don't build directly from staging, because only very small inputs could come from there and they'd impede the transfer efficiency of the larger ones
-			buildInfo.instanceData = {.offset=offsets[1],.buffer=smart_refctd_ptr(range.buffer)};
+			buildInfo.instanceData = {.offset=offsets[as->usesMotion() ? 2:1],.buffer=smart_refctd_ptr(range.buffer)};
 			// be based cause vectors can grow
 			{
 				const auto offset = trackedBLASes.size();
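Both callbacks above implement the same `IUtilities::IUpstreamingDataProducer` contract: the utility invokes `operator()` with successive destination blocks of the range, and the producer returns how many bytes it actually wrote — which is what lets `FillInstances` stop short of an instance that would straddle a block boundary. A stripped-down producer showing just that contract (hypothetical payload, not part of the commit):

// Minimal IUpstreamingDataProducer sketch: stream a sequence of uint32_t values,
// only ever writing whole elements into each block the utility hands us.
struct FillSequence : IUtilities::IUpstreamingDataProducer
{
	uint32_t operator()(void* dst, const size_t offsetInRange, const uint32_t blockSize) override
	{
		const uint32_t first = static_cast<uint32_t>(offsetInRange/sizeof(uint32_t));
		const uint32_t count = blockSize/sizeof(uint32_t); // whole elements only
		auto* out = reinterpret_cast<uint32_t*>(dst);
		for (uint32_t i=0; i<count; i++)
			out[i] = first+i;
		return count*sizeof(uint32_t); // bytes actually produced
	}
};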
