
Commit d69cd60

Author: devsh (committed)
correct the calculation of scratch memory needed, and avoid deadlock (allocation failure) due to worst case fragmentation
also fix one memory freeing bug
1 parent 0abbb21 commit d69cd60

1 file changed: +65 −34 lines

src/nbl/video/utilities/CAssetConverter.cpp

Lines changed: 65 additions & 34 deletions
@@ -2828,13 +2828,13 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult
     const auto buildFlags = patch.getBuildFlags(as);
     const auto outIx = i+entry.second.firstCopyIx;
     const auto uniqueCopyGroupID = conversionRequests.gpuObjUniqueCopyGroupIDs[outIx];
-    // prevent CPU hangs by making sure allocator big enough to service us in worst case but with best case allocator (no other allocations, clean alloc)
+    // prevent CPU hangs by making sure allocator big enough to service us in worst case
     const auto minScratchAllocSize = patch.hostBuild ? inputs.scratchForHostASBuildMinAllocSize:inputs.scratchForDeviceASBuildMinAllocSize;
-    uint64_t buildSize = 0; uint32_t buildAlignment = 4;
-    auto incrementBuildSize = [minScratchAllocSize,&buildSize,&buildAlignment](const uint64_t size, const uint32_t alignment)->void
+    uint64_t buildSize = 0;
+    auto incrementBuildSize = [minScratchAllocSize,&buildSize](const uint64_t size, const uint32_t alignment)->void
     {
-        buildSize = core::alignUp(buildSize,alignment)+hlsl::max<uint64_t>(size,minScratchAllocSize);
-        buildAlignment = hlsl::max(buildAlignment,alignment);
+        // account for fragmentation and misalignment
+        buildSize += hlsl::max<uint64_t>(size,minScratchAllocSize)+hlsl::max<uint32_t>(minScratchAllocSize,alignment)*2;
     };
     ILogicalDevice::AccelerationStructureBuildSizes sizes = {};
     const auto hashAsU64 = reinterpret_cast<const uint64_t*>(entry.first.data);
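The reworked incrementBuildSize no longer packs sub-allocations back-to-back against a running maximum alignment; every request is instead padded so that a worst-case fragmented scratch allocator can still serve it. A minimal sketch of that sizing rule in plain C++, with illustrative names (WorstCaseSizer and add are not Nabla API):

#include <algorithm>
#include <cstdint>

// Worst-case scratch sizing: each sub-allocation is rounded up to the allocator's
// minimum allocation size, then padded by twice the larger of (min alloc size,
// required alignment) to absorb alignment waste and the unusable fragment a
// previous allocation may leave behind.
struct WorstCaseSizer
{
    uint64_t minAllocSize; // allocator granularity (scratchFor*ASBuildMinAllocSize in the hunk above)
    uint64_t total = 0;

    void add(const uint64_t size, const uint32_t alignment)
    {
        total += std::max(size, minAllocSize) + 2ull * std::max<uint64_t>(minAllocSize, alignment);
    }
};

Per the commit message, the intent of the extra padding is to keep the reserved scratch large enough that a single build's requests cannot deadlock on allocation failure even when the allocator is fragmented.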
@@ -2855,35 +2855,45 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult
     const uint32_t* pPrimitiveCounts = as->getGeometryPrimitiveCounts().data();
     if (buildFlags.hasFlags(ICPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT))
     {
-        sizes = device->getAccelerationStructureBuildSizes(patch.hostBuild,buildFlags,motionBlur,as->getAABBGeometries(),pPrimitiveCounts);
+        const auto geoms = as->getAABBGeometries();
+        sizes = device->getAccelerationStructureBuildSizes(patch.hostBuild,buildFlags,motionBlur,geoms,pPrimitiveCounts);
         for (const auto& geom : geoms)
         if (const auto aabbCount=*(pPrimitiveCounts++); aabbCount)
             incrementBuildSize(aabbCount*geom.stride,alignof(float));
     }
     else
     {
-        sizes = device->getAccelerationStructureBuildSizes(patch.hostBuild,buildFlags,motionBlur,as->getTriangleGeometries(),pPrimitiveCounts);
+        const auto geoms = as->getTriangleGeometries();
+        sizes = device->getAccelerationStructureBuildSizes(patch.hostBuild,buildFlags,motionBlur,geoms,pPrimitiveCounts);
         for (const auto& geom : geoms)
         if (const auto triCount=*(pPrimitiveCounts++); triCount)
         {
             auto size = geom.vertexStride*(geom.vertexData[1] ? 2:1)*geom.maxVertex;
+            uint16_t alignment = hlsl::max(0x1u<<hlsl::findLSB(geom.vertexStride),32u);
             if (geom.hasTransform())
+            {
                 size = core::alignUp(size,alignof(float))+sizeof(hlsl::float32_t3x4);
-            auto alignment = 0u;
+                alignment = hlsl::max<uint16_t>(alignof(float),alignment);
+            }
+            uint16_t indexSize = 0;
             switch (geom.indexType)
             {
                 case E_INDEX_TYPE::EIT_16BIT:
-                    alignment = alignof(uint16_t);
+                    indexSize = sizeof(uint16_t);
                     break;
                 case E_INDEX_TYPE::EIT_32BIT:
-                    alignment = alignof(uint32_t);
+                    indexSize = sizeof(uint32_t);
                     break;
                 default:
                     break;
             }
-            if (alignment)
-                size = core::alignUp(size,alignment)+triCount*3*alignment;
-            incrementBuildSize(size,hlsl::max(alignment,geom.vertexStride));
+            if (indexSize)
+            {
+                size = core::alignUp(size,indexSize)+triCount*3*indexSize;
+                alignment = hlsl::max<uint16_t>(indexSize,alignment);
+            }
+            inputs.logger.log("%p Triangle Data Size %d Align %d",system::ILogger::ELL_DEBUG,as,size,alignment);
+            incrementBuildSize(size,alignment);
         }
     }
 }
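The stride-derived alignment introduced above is the largest power of two that divides the vertex stride, but never less than 32; transform and index data then only raise it. A tiny illustrative helper (strideAlignment is a hypothetical name, using standard C++ instead of the hlsl:: utilities, and assuming a non-zero stride):

#include <algorithm>
#include <bit>
#include <cstdint>

// Largest power of two dividing the stride (equivalent to 1 << findLSB(stride)),
// floored at 32 so the sub-allocation is never under-aligned for the device.
inline uint32_t strideAlignment(const uint32_t vertexStride)
{
    return std::max(1u << std::countr_zero(vertexStride), 32u);
}
// e.g. stride 24 -> 8 -> 32, stride 48 -> 16 -> 32, stride 64 -> 64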
@@ -2896,8 +2906,9 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult
     );
     continue;
 }
-// scratch gets allocated first
-buildSize = core::alignUp(hlsl::max<uint64_t>(sizes.buildScratchSize,minScratchAllocSize),buildAlignment)+buildSize;
+//
+incrementBuildSize(sizes.buildScratchSize,device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment);
+inputs.logger.log("%p Scratch Size %d Combined %d",system::ILogger::ELL_DEBUG,as,sizes.buildScratchSize,buildSize);

 // we need to save the buffer in a side-channel for later
 auto& out = accelerationStructureParams[IsTLAS][entry.second.firstCopyIx+i];
@@ -4718,12 +4729,14 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
 {
     submitsNeeded |= IQueue::FAMILY_FLAGS::COMPUTE_BIT;
     // queue up a deferred allocation
-    params.scratchForDeviceASBuild->multi_deallocate(oldAllocCount,allocOffsets.data(),allocSizes.data(),params.compute->getFutureScratchSemaphore());
+    if (oldAllocCount)
+        params.scratchForDeviceASBuild->multi_deallocate(oldAllocCount,allocOffsets.data(),allocSizes.data(),params.compute->getFutureScratchSemaphore());
 }
 else
 {
     // release right away
-    params.scratchForDeviceASBuild->multi_deallocate(oldAllocCount,allocOffsets.data(),allocSizes.data());
+    if (oldAllocCount)
+        params.scratchForDeviceASBuild->multi_deallocate(oldAllocCount,allocOffsets.data(),allocSizes.data());
     for (const auto& info : buildInfos)
     {
         const auto stagingFound = findInStaging.template operator()<CPUAccelerationStructure>(info.dstAS);
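Both release paths are now gated on oldAllocCount, so a batched free is only issued when something was actually allocated. A hedged restatement of the pattern with hypothetical names and stubbed signatures (Scratch and releaseOld are illustrative, not the converter's interface):

#include <cstdint>

// Illustrative batched allocator with an immediate and a semaphore-deferred release.
struct Scratch
{
    void multi_deallocate(uint32_t, const uint64_t*, const uint64_t*) {}                // immediate
    void multi_deallocate(uint32_t, const uint64_t*, const uint64_t*, const void*) {}   // deferred until the semaphore signals
};

void releaseOld(Scratch& scratch, uint32_t oldAllocCount, const uint64_t* offsets, const uint64_t* sizes,
                const void* futureScratchSemaphore, bool computeSubmitPending)
{
    if (!oldAllocCount) // nothing was allocated, don't issue an empty batched free
        return;
    if (computeSubmitPending)
        scratch.multi_deallocate(oldAllocCount, offsets, sizes, futureScratchSemaphore); // queue up a deferred release
    else
        scratch.multi_deallocate(oldAllocCount, offsets, sizes); // release right away
}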
@@ -4766,7 +4779,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
 auto allocCount = 0;
 auto deallocSrc = core::makeRAIIExiter([&params,&allocOffsets,&allocSizes,&alignments,&allocCount]()->void
 {
-    const auto beginIx = allocSizes.size()-alignments.size();
+    const auto beginIx = allocSizes.size()-allocCount;
     // if got to end of loop queue up the release of memory, otherwise release right away
     if (allocCount)
         params.scratchForDeviceASBuild->multi_deallocate(allocCount,allocOffsets.data()+beginIx,allocSizes.data()+beginIx);
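The scope-exit guard now derives the start of the range it releases from allocCount, the number of allocations this iteration actually made, rather than from the request count (alignments.size()), so a partially failed iteration frees exactly the tail it owns. A minimal sketch of the same scope-exit pattern (OnScopeExit is an illustrative stand-in for core::makeRAIIExiter):

#include <cstddef>
#include <vector>

// Minimal scope-exit guard: runs the stored callable when the scope unwinds.
template<typename F>
struct OnScopeExit { F f; ~OnScopeExit() { f(); } };
template<typename F> OnScopeExit(F) -> OnScopeExit<F>;

void sketch(std::vector<size_t>& allocOffsets, std::vector<size_t>& allocSizes)
{
    size_t allocCount = 0; // how many allocations this scope actually made
    OnScopeExit cleanup{[&]()
    {
        if (allocCount)
        {
            // index from the count of successes, not from the number of requests
            const size_t beginIx = allocSizes.size() - allocCount;
            // e.g. multi_deallocate(allocCount, allocOffsets.data()+beginIx, allocSizes.data()+beginIx)
            allocOffsets.resize(beginIx);
            allocSizes.resize(beginIx);
        }
    }};
    // ... append offsets/sizes and bump allocCount as each allocation succeeds ...
}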
@@ -4837,42 +4850,60 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
 if (const auto triCount=*(pPrimitiveCounts++); triCount)
 {
     auto size = geom.vertexStride*(geom.vertexData[1] ? 2:1)*geom.maxVertex;
+    uint16_t alignment = hlsl::max(0x1u<<hlsl::findLSB(geom.vertexStride),32u);
     if (geom.hasTransform())
+    {
         size = core::alignUp(size,alignof(float))+sizeof(hlsl::float32_t3x4);
-    auto alignment = 0u;
+        alignment = hlsl::max<uint16_t>(alignof(float),alignment);
+    }
+    uint16_t indexSize = 0u;
     switch (geom.indexType)
     {
         case E_INDEX_TYPE::EIT_16BIT:
-            alignment = alignof(uint16_t);
+            indexSize = alignof(uint16_t);
             break;
         case E_INDEX_TYPE::EIT_32BIT:
-            alignment = alignof(uint32_t);
+            indexSize = alignof(uint32_t);
             break;
         default:
             break;
     }
-    if (alignment)
-        size = core::alignUp(size,alignment)+triCount*3*alignment;
+    if (indexSize)
+    {
+        size = core::alignUp(size,indexSize)+triCount*3*indexSize;
+        alignment = hlsl::max<uint16_t>(indexSize,alignment);
+    }
     allocSizes.push_back(size);
-    alignments.push_back(hlsl::max(alignment,geom.vertexStride));
+    alignments.push_back(alignment);
+    const auto tmp = asToBuild.second.scratchSize;
+    logger.log("%p Triangle Data Size %d Align %d Scratch Size %d",system::ILogger::ELL_DEBUG,canonical.get(),size,alignment,tmp);
 }
 }
 }
 allocOffsets.resize(allocSizes.size(),scratch_allocator_t::invalid_value);
 // allocate out scratch or submit overflow, if fail then flush and keep trying till space is made
-auto* const offsets = allocOffsets.data()+allocOffsets.size()-alignments.size();
-const auto* const sizes = allocSizes.data()+allocSizes.size()-alignments.size();
+auto* offsets = allocOffsets.data()+allocOffsets.size()-alignments.size();
+const auto* sizes = allocSizes.data()+allocSizes.size()-alignments.size();
+logger.log("%p Combined Size %d",system::ILogger::ELL_DEBUG,canonical.get(),std::accumulate(sizes,sizes+alignments.size(),0));
 for (uint32_t t=0; params.scratchForDeviceASBuild->multi_allocate(alignments.size(),offsets,sizes,alignments.data())!=0; t++)
-if (t==1) // don't flush right away cause allocator not defragmented yet
 {
-    recordBuildCommands();
-    // if writing to scratch directly, flush the writes
-    if (!flushRanges.empty())
+    if (t==1) // don't flush right away cause allocator not defragmented yet
     {
-        device->flushMappedMemoryRanges(flushRanges);
-        flushRanges.clear();
+        recordBuildCommands();
+        // the submit overflow deallocates old offsets and erases them from the temp arrays, pointer changes
+        offsets = allocOffsets.data();
+        sizes = allocSizes.data();
+        // if writing to scratch directly, flush the writes
+        if (!flushRanges.empty())
+        {
+            device->flushMappedMemoryRanges(flushRanges);
+            flushRanges.clear();
+        }
+        drainCompute();
     }
-    drainCompute();
+    // we may be preventing ourselves from allocating memory, with one successful allocation still being alive and fragmenting our allocator
+    params.scratchForDeviceASBuild->multi_deallocate(alignments.size(),offsets,sizes);
+    std::fill_n(offsets,alignments.size(),scratch_allocator_t::invalid_value);
 }
 // now upon a failure, our allocations will need to be deallocated
 allocCount = alignments.size();
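This retry loop is the heart of the deadlock fix: when multi_allocate fails, any offsets that a partially successful attempt did hand out are returned immediately, so the caller's own half-finished request can no longer fragment the scratch allocator and starve every later attempt; only from the second failure onwards does it record the pending builds, flush mapped scratch writes, and drain the compute queue to make room. A compressed, hedged sketch of the control flow (ScratchAllocator and allocateWithRetry are illustrative names with stubbed behaviour, not the converter's actual helpers):

#include <algorithm>
#include <cstdint>
#include <vector>

// Hypothetical batched sub-allocator, for illustration only.
struct ScratchAllocator
{
    static constexpr uint64_t invalid_value = ~0ull;
    // returns the number of requests that could NOT be satisfied; stubbed here
    uint32_t multi_allocate(size_t, uint64_t*, const uint64_t*, const uint32_t*) { return 0; }
    // entries holding invalid_value are assumed to be ignored; stubbed here
    void multi_deallocate(size_t, const uint64_t*, const uint64_t*) {}
};

void allocateWithRetry(ScratchAllocator& scratch,
                       std::vector<uint64_t>& offsets,
                       const std::vector<uint64_t>& sizes,
                       const std::vector<uint32_t>& aligns)
{
    for (uint32_t t = 0; scratch.multi_allocate(sizes.size(), offsets.data(), sizes.data(), aligns.data()) != 0; t++)
    {
        if (t == 1) // give the allocator one cheap retry before the expensive path
        {
            // submit pending builds, flush mapped scratch writes, wait for compute to retire
            // (recordBuildCommands / flushMappedMemoryRanges / drainCompute in the hunk above)
        }
        // hand back whatever this attempt did manage to grab: keeping a partial success
        // alive fragments the allocator and can starve every later attempt (the deadlock)
        scratch.multi_deallocate(sizes.size(), offsets.data(), sizes.data());
        std::fill(offsets.begin(), offsets.end(), ScratchAllocator::invalid_value);
    }
}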
@@ -5055,7 +5086,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
     outGeom.indexType = geom.indexType;
     outGeom.geometryFlags = geom.geometryFlags;
 }
-buildInfo.triangles = reinterpret_cast<const IGPUBottomLevelAccelerationStructure::Triangles<const IGPUBuffer>* const&>(trianglesOffset);
+buildInfo.triangles = reinterpret_cast<const IGPUBottomLevelAccelerationStructure::Triangles<IGPUBuffer>* const&>(trianglesOffset);
 }
 success = pPrimitiveCounts==primitiveCounts.data()+primitiveCounts.size();
 rangeInfos.push_back(reinterpret_cast<const IGPUBottomLevelAccelerationStructure::BuildRangeInfo* const&>(geometryRangeInfoOffset));
