Skip to content

Commit 6675224

Browse files
author
devsh
committed
started writing the Acceleration Structure building code, realized we need to bucket the Device and Host build requests separately
1 parent 11a141c commit 6675224

File tree

1 file changed

+119
-9
lines changed

1 file changed

+119
-9
lines changed

src/nbl/video/utilities/CAssetConverter.cpp

Lines changed: 119 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3377,7 +3377,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
33773377
{
33783378
if (reqQueueFlags.hasFlags(IQueue::FAMILY_FLAGS::TRANSFER_BIT) && (!params.utilities || params.utilities->getLogicalDevice()!=device))
33793379
{
3380-
logger.log("Transfer Capability required for this conversion and no compatible `utilities` provided!", system::ILogger::ELL_ERROR);
3380+
logger.log("Transfer Capability required for this conversion and no compatible `utilities` provided!",system::ILogger::ELL_ERROR);
33813381
return retval;
33823382
}
33833383

@@ -3406,6 +3406,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
34063406
};
34073407
// If the transfer queue will be used, the transfer Intended Submit Info must be valid and utilities must be provided
34083408
auto reqTransferQueueCaps = IQueue::FAMILY_FLAGS::TRANSFER_BIT;
3409+
// Depth/Stencil transfers need Graphics Capabilities, so make sure the queue chosen for transfers also has them!
34093410
if (reservations.m_queueFlags.hasFlags(IQueue::FAMILY_FLAGS::GRAPHICS_BIT))
34103411
reqTransferQueueCaps |= IQueue::FAMILY_FLAGS::GRAPHICS_BIT;
34113412
if (invalidIntended(reqTransferQueueCaps,params.transfer))
@@ -3428,7 +3429,52 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
34283429
}
34293430
}
34303431

3431-
// wipe gpu item in staging cache (this may drop it as well if it was made for only a root asset == no users)
3432+
// check things necessary for building Acceleration Structures
3433+
using buffer_usage_f = IGPUBuffer::E_USAGE_FLAGS;
3434+
if (reservations.m_ASBuildScratchUsages!=buffer_usage_f::EUF_NONE)
3435+
{
3436+
if (!params.scratchForDeviceASBuild)
3437+
{
3438+
logger.log("An Acceleration Structure will be built on Device but no scratch allocator provided!",system::ILogger::ELL_ERROR);
3439+
return retval;
3440+
}
3441+
// TODO: do the build input buffers also need `EUF_STORAGE_BUFFER_BIT` ?
3442+
constexpr buffer_usage_f asBuildInputFlags = buffer_usage_f::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT|buffer_usage_f::EUF_TRANSFER_DST_BIT|buffer_usage_f::EUF_SHADER_DEVICE_ADDRESS_BIT;
3443+
// we may use the staging buffer directly to skip an extra copy on small enough geometries
3444+
if (!params.utilities->getDefaultUpStreamingBuffer()->getBuffer()->getCreationParams().usage.hasFlags(asBuildInputFlags))
3445+
{
3446+
logger.log("An Acceleration Structure will be built on Device but Default UpStreaming Buffer from IUtilities doesn't have required usage flags!",system::ILogger::ELL_ERROR);
3447+
return retval;
3448+
}
3449+
constexpr buffer_usage_f asBuildScratchFlags = buffer_usage_f::EUF_STORAGE_BUFFER_BIT|buffer_usage_f::EUF_SHADER_DEVICE_ADDRESS_BIT;
3450+
// we use the scratch allocator both for scratch and uploaded geometry data
3451+
if (!params.scratchForDeviceASBuild->getBuffer()->getCreationParams().usage.hasFlags(asBuildScratchFlags|asBuildInputFlags))
3452+
{
3453+
logger.log("An Acceleration Structure will be built on Device but scratch buffer doesn't have required usage flags!",system::ILogger::ELL_ERROR);
3454+
return retval;
3455+
}
3456+
const auto& addrAlloc = params.scratchForDeviceASBuild->getAddressAllocator();
3457+
// could have used an address allocator trait to work this out, same verbosity
3458+
if (addrAlloc.get_allocated_size()+addrAlloc.get_free_size()<reservations.m_minASBuildScratchSize[0])
3459+
{
3460+
logger.log("Acceleration Structure Scratch Device Memory Allocator not large enough!",system::ILogger::ELL_ERROR);
3461+
return retval;
3462+
}
3463+
}
3464+
// the elusive and exotic host builds
3465+
if (reservations.m_willHostBuildSomeAS && !params.scratchForHostASBuild)
3466+
{
3467+
logger.log("An Acceleration Structure will be built on the Host but no Scratch Memory Allocator provided!", system::ILogger::ELL_ERROR);
3468+
return retval;
3469+
}
3470+
// and compacting
3471+
if (reservations.m_willCompactSomeAS && !params.compactedASAllocator)
3472+
{
3473+
logger.log("An Acceleration Structure will be compacted but no Device Memory Allocator provided!", system::ILogger::ELL_ERROR);
3474+
return retval;
3475+
}
3476+
3477+
//
34323478
auto findInStaging = [&reservations]<Asset AssetType>(const typename asset_traits<AssetType>::video_t* gpuObj)->core::blake3_hash_t*
34333479
{
34343480
auto& stagingCache = std::get<SReserveResult::staging_cache_t<AssetType>>(reservations.m_stagingCaches);
@@ -3547,9 +3593,9 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
35473593
const auto computeFamily = shouldDoSomeCompute ? params.compute->queue->getFamilyIndex():IQueue::FamilyIgnored;
35483594
// whenever transfer needs to do a submit overflow because it ran out of memory for streaming an image, we can already submit the recorded mip-map compute shader dispatches
35493595
auto computeCmdBuf = shouldDoSomeCompute ? params.compute->getCommandBufferForRecording():nullptr;
3550-
auto drainCompute = [&params,shouldDoSomeCompute,&computeCmdBuf](const std::span<const IQueue::SSubmitInfo::SSemaphoreInfo> extraSignal={})->auto
3596+
auto drainCompute = [&params,&computeCmdBuf](const std::span<const IQueue::SSubmitInfo::SSemaphoreInfo> extraSignal={})->auto
35513597
{
3552-
if (!shouldDoSomeCompute || computeCmdBuf->cmdbuf->empty())
3598+
if (!computeCmdBuf || computeCmdBuf->cmdbuf->empty())
35533599
return IQueue::RESULT::SUCCESS;
35543600
// before we overflow submit we need to inject extra wait semaphores
35553601
auto& waitSemaphoreSpan = params.compute->waitSemaphores;
@@ -3568,6 +3614,8 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
35683614
IQueue::RESULT res = params.compute->submit(computeCmdBuf,extraSignal);
35693615
if (res!=IQueue::RESULT::SUCCESS)
35703616
return res;
3617+
// set to empty so we don't grow over and over again
3618+
waitSemaphoreSpan = {};
35713619
if (!params.compute->beginNextCommandBuffer(computeCmdBuf))
35723620
return IQueue::RESULT::OTHER_ERROR;
35733621
return res;
@@ -4039,7 +4087,65 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
40394087
imagesToUpload.clear();
40404088
}
40414089

4042-
// TODO: build BLASes and TLASes
4090+
// BLAS builds
4091+
auto& blasToBuild = std::get<SReserveResult::conversion_requests_t<ICPUBottomLevelAccelerationStructure>>(reservations.m_conversionRequests);
4092+
if (const auto blasCount = blasToBuild.size(); blasCount)
4093+
{
4094+
constexpr auto GeometryIsAABBFlag = ICPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT;
4095+
4096+
core::vector<IGPUBottomLevelAccelerationStructure::DeviceBuildInfo> buildInfos; buildInfos.reserve(blasCount);
4097+
core::vector<IGPUBottomLevelAccelerationStructure::DeviceBuildInfo> rangeInfo; rangeInfo.reserve(blasCount);
4098+
core::vector<IGPUBottomLevelAccelerationStructure::Triangles<const IGPUBuffer>> triangles;
4099+
core::vector<IGPUBottomLevelAccelerationStructure::AABBs<const IGPUBuffer>> aabbs;
4100+
{
4101+
size_t totalTriGeoCount = 0;
4102+
size_t totalAABBGeoCount = 0;
4103+
for (auto& item : blasToBuild)
4104+
{
4105+
const size_t geoCount = item.canonical->getGeometryCount();
4106+
if (item.canonical->getBuildFlags().hasFlags(GeometryIsAABBFlag))
4107+
totalAABBGeoCount += geoCount;
4108+
else
4109+
totalTriGeoCount += geoCount;
4110+
}
4111+
triangles.reserve(totalTriGeoCount);
4112+
triangles.reserve(totalAABBGeoCount);
4113+
}
4114+
for (auto& item : blasToBuild)
4115+
{
4116+
auto* as = item.gpuObj;
4117+
auto pFoundHash = findInStaging.operator()<ICPUBottomLevelAccelerationStructure>(as);
4118+
if (item.asBuildParams.host)
4119+
{
4120+
auto dOp = device->createDeferredOperation();
4121+
//
4122+
if (!device->buildAccelerationStructure(dOp.get(),info,range))
4123+
{
4124+
markFailureInStaging(gpuObj,pFoundHash);
4125+
continue;
4126+
}
4127+
}
4128+
else
4129+
{
4130+
auto& buildInfo = buildInfo.emplace_back({
4131+
.buildFlags = item.buildFlags,
4132+
.geometryCount = item.canonical->getGeometryCount(),
4133+
// this is not an update
4134+
.srcAS = nullptr,
4135+
.dstAS = as.get()
4136+
});
4137+
if (item.canonical->getBuildFlags().hasFlags(GeometryIsAABBFlag))
4138+
buildInfo.aabbs = nullptr;
4139+
else
4140+
buildInfo.triangles = nullptr;
4141+
computeCmdBuf->cmdbuf->buildAccelerationStructures(buildInfo,rangeInfo);
4142+
}
4143+
}
4144+
}
4145+
4146+
// TLAS builds
4147+
auto& tlasToBuild = std::get<SReserveResult::conversion_requests_t<ICPUTopLevelAccelerationStructure>>(reservations.m_conversionRequests);
4148+
if (!tlasToBuild.empty())
40434149
{
40444150
}
40454151

@@ -4100,6 +4206,10 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
41004206
// rescan all the GPU objects and find out if they depend on anything that failed, if so add to failure set
41014207
bool depsMissing = false;
41024208
// only go over types we could actually break via missing upload/build (i.e. pipelines are unbreakable)
4209+
if constexpr (std::is_same_v<AssetType,ICPUTopLevelAccelerationStructure>)
4210+
{
4211+
// there's no lifetime tracking (refcounting) from TLAS to BLAS, so one just must trust the pre-TLAS-build input validation to do its job
4212+
}
41034213
if constexpr (std::is_same_v<AssetType,ICPUBufferView>)
41044214
depsMissing = missingDependent.operator()<ICPUBuffer>(item.first->getUnderlyingBuffer());
41054215
if constexpr (std::is_same_v<AssetType,ICPUImageView>)
@@ -4141,8 +4251,8 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
41414251
depsMissing = missingDependent.operator()<ICPUBufferView>(static_cast<const IGPUBufferView*>(untypedDesc));
41424252
break;
41434253
case asset::IDescriptor::EC_ACCELERATION_STRUCTURE:
4144-
_NBL_TODO();
4145-
[[fallthrough]];
4254+
depsMissing = missingDependent.operator()<ICPUTopLevelAccelerationStructure>(static_cast<const ICPUTopLevelAccelerationStructure*>(untypedDesc));
4255+
break;
41464256
default:
41474257
assert(false);
41484258
depsMissing = true;
@@ -4170,8 +4280,8 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
41704280
// again, need to go bottom up so we can check dependencies being successes
41714281
mergeCache.operator()<ICPUBuffer>();
41724282
mergeCache.operator()<ICPUImage>();
4173-
// mergeCache.operator()<ICPUBottomLevelAccelerationStructure>();
4174-
// mergeCache.operator()<ICPUTopLevelAccelerationStructure>();
4283+
mergeCache.operator()<ICPUBottomLevelAccelerationStructure>();
4284+
mergeCache.operator()<ICPUTopLevelAccelerationStructure>();
41754285
mergeCache.operator()<ICPUBufferView>();
41764286
mergeCache.operator()<ICPUImageView>();
41774287
mergeCache.operator()<ICPUShader>();

0 commit comments

Comments
 (0)