@@ -2828,13 +2828,13 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult
     const auto buildFlags = patch.getBuildFlags(as);
     const auto outIx = i+entry.second.firstCopyIx;
     const auto uniqueCopyGroupID = conversionRequests.gpuObjUniqueCopyGroupIDs[outIx];
-    // prevent CPU hangs by making sure allocator big enough to service us in worst case but with best case allocator (no other allocations, clean alloc)
+    // prevent CPU hangs by making sure the allocator is big enough to service us in the worst case
     const auto minScratchAllocSize = patch.hostBuild ? inputs.scratchForHostASBuildMinAllocSize:inputs.scratchForDeviceASBuildMinAllocSize;
-    uint64_t buildSize = 0; uint32_t buildAlignment = 4;
-    auto incrementBuildSize = [minScratchAllocSize,&buildSize,&buildAlignment](const uint64_t size, const uint32_t alignment)->void
+    uint64_t buildSize = 0;
+    auto incrementBuildSize = [minScratchAllocSize,&buildSize](const uint64_t size, const uint32_t alignment)->void
     {
-        buildSize = core::alignUp(buildSize,alignment)+hlsl::max<uint64_t>(size,minScratchAllocSize);
-        buildAlignment = hlsl::max(buildAlignment,alignment);
+        // account for fragmentation and misalignment
+        buildSize += hlsl::max<uint64_t>(size,minScratchAllocSize)+hlsl::max<uint32_t>(minScratchAllocSize,alignment)*2;
     };
     ILogicalDevice::AccelerationStructureBuildSizes sizes = {};
     const auto hashAsU64 = reinterpret_cast<const uint64_t*>(entry.first.data);
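
The new `incrementBuildSize` trades precision for safety: instead of tracking a combined alignment and packing requests tightly (which only holds for a pristine allocator with no other live allocations), every request is padded to the allocator's minimum allocation size plus twice the worse of that minimum and the requested alignment, so the reservation survives a fragmented general-purpose allocator. A minimal standalone sketch of the same padding rule (the helper name and loop are illustrative, not Nabla API):

#include <algorithm>
#include <cstddef>
#include <cstdint>

// Worst-case reservation for a batch of scratch suballocations, mirroring the
// new incrementBuildSize lambda: pad each request so that no interleaving or
// fragmentation inside the real allocator can make the batch fail.
uint64_t worstCaseBuildSize(const uint64_t* sizes, const uint32_t* alignments, size_t count, uint64_t minScratchAllocSize)
{
    uint64_t total = 0;
    for (size_t i = 0; i < count; i++)
    {
        // the allocator never hands out less than its minimum allocation size
        total += std::max(sizes[i], minScratchAllocSize);
        // headroom for misalignment on both ends of the block
        total += std::max<uint64_t>(minScratchAllocSize, alignments[i]) * 2;
    }
    return total;
}

For example, with `minScratchAllocSize = 256`, a single 100-byte request aligned to 64 reserves 256 + 512 = 768 bytes rather than 100.
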
@@ -2855,35 +2855,45 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult
     const uint32_t* pPrimitiveCounts = as->getGeometryPrimitiveCounts().data();
     if (buildFlags.hasFlags(ICPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT))
     {
-        sizes = device->getAccelerationStructureBuildSizes(patch.hostBuild,buildFlags,motionBlur,as->getAABBGeometries(),pPrimitiveCounts);
+        const auto geoms = as->getAABBGeometries();
+        sizes = device->getAccelerationStructureBuildSizes(patch.hostBuild,buildFlags,motionBlur,geoms,pPrimitiveCounts);
         for (const auto& geom : geoms)
         if (const auto aabbCount=*(pPrimitiveCounts++); aabbCount)
             incrementBuildSize(aabbCount*geom.stride,alignof(float));
     }
     else
     {
-        sizes = device->getAccelerationStructureBuildSizes(patch.hostBuild,buildFlags,motionBlur,as->getTriangleGeometries(),pPrimitiveCounts);
+        const auto geoms = as->getTriangleGeometries();
+        sizes = device->getAccelerationStructureBuildSizes(patch.hostBuild,buildFlags,motionBlur,geoms,pPrimitiveCounts);
         for (const auto& geom : geoms)
         if (const auto triCount=*(pPrimitiveCounts++); triCount)
         {
             auto size = geom.vertexStride*(geom.vertexData[1] ? 2:1)*geom.maxVertex;
+            uint16_t alignment = hlsl::max(0x1u<<hlsl::findLSB(geom.vertexStride),32u);
             if (geom.hasTransform())
+            {
                 size = core::alignUp(size,alignof(float))+sizeof(hlsl::float32_t3x4);
-            auto alignment = 0u;
+                alignment = hlsl::max<uint16_t>(alignof(float),alignment);
+            }
+            uint16_t indexSize = 0;
             switch (geom.indexType)
             {
                 case E_INDEX_TYPE::EIT_16BIT:
-                    alignment = alignof(uint16_t);
+                    indexSize = sizeof(uint16_t);
                     break;
                 case E_INDEX_TYPE::EIT_32BIT:
-                    alignment = alignof(uint32_t);
+                    indexSize = sizeof(uint32_t);
                     break;
                 default:
                     break;
             }
-            if (alignment)
-                size = core::alignUp(size,alignment)+triCount*3*alignment;
-            incrementBuildSize(size,hlsl::max(alignment,geom.vertexStride));
+            if (indexSize)
+            {
+                size = core::alignUp(size,indexSize)+triCount*3*indexSize;
+                alignment = hlsl::max<uint16_t>(indexSize,alignment);
+            }
+            inputs.logger.log("%p Triangle Data Size %d Align %d",system::ILogger::ELL_DEBUG,as,size,alignment);
+            incrementBuildSize(size,alignment);
         }
     }
 }
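
For triangle geometry the staged data is the vertex buffer (doubled when a second vertex buffer is present for motion blur), an optional 3x4 float transform, and the index buffer, each aligned before being appended; the vertex alignment is the largest power of two dividing the stride, floored at 32. A rough self-contained mirror of that arithmetic (struct and field names are assumptions standing in for the engine's types, stride assumed non-zero):

#include <algorithm>
#include <bit>
#include <cstdint>
#include <utility>

// Assumed stand-in for the engine's triangle-geometry description.
struct TriGeom
{
    uint32_t vertexStride;   // bytes between consecutive vertices
    uint32_t maxVertex;      // highest vertex index referenced
    bool hasMotion;          // second vertex buffer present (motion blur)
    bool hasTransform;       // 3x4 float transform appended
    uint16_t indexSize;      // 0 (non-indexed), 2 or 4 bytes
};

static uint64_t alignUp(uint64_t v, uint64_t a) { return (v + a - 1) & ~(a - 1); }

// Size and alignment the conversion reserves for one geometry's staged data.
std::pair<uint64_t, uint16_t> stagingRequirements(const TriGeom& g, uint32_t triCount)
{
    uint64_t size = uint64_t(g.vertexStride) * (g.hasMotion ? 2 : 1) * g.maxVertex;
    // largest power of two dividing the stride, floored at 32
    uint16_t alignment = uint16_t(std::max(1u << std::countr_zero(g.vertexStride), 32u));
    if (g.hasTransform)
    {
        size = alignUp(size, alignof(float)) + sizeof(float) * 3 * 4; // float32_t3x4
        alignment = std::max<uint16_t>(alignof(float), alignment);
    }
    if (g.indexSize)
    {
        size = alignUp(size, g.indexSize) + uint64_t(triCount) * 3 * g.indexSize;
        alignment = std::max<uint16_t>(g.indexSize, alignment);
    }
    return { size, alignment };
}
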
@@ -2896,8 +2906,9 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult
         );
         continue;
     }
-    // scratch gets allocated first
-    buildSize = core::alignUp(hlsl::max<uint64_t>(sizes.buildScratchSize,minScratchAllocSize),buildAlignment)+buildSize;
+    // scratch now gets sized like any other suballocation
+    incrementBuildSize(sizes.buildScratchSize,device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment);
+    inputs.logger.log("%p Scratch Size %d Combined %d",system::ILogger::ELL_DEBUG,as,sizes.buildScratchSize,buildSize);
 
     // we need to save the buffer in a side-channel for later
     auto& out = accelerationStructureParams[IsTLAS][entry.second.firstCopyIx+i];
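
The scratch buffer is no longer special-cased ahead of the other allocations; it goes through the same `incrementBuildSize` path, aligned to the device limit `minAccelerationStructureScratchOffsetAlignment`. For reference, a sketch of where that limit comes from in raw Vulkan (Nabla surfaces it through `getPhysicalDevice()->getLimits()`):

#include <vulkan/vulkan.h>

// Query the scratch offset alignment required for every suballocation handed
// to vkCmdBuildAccelerationStructuresKHR (Vulkan 1.1+ properties chain).
uint32_t queryScratchOffsetAlignment(VkPhysicalDevice physDev)
{
    VkPhysicalDeviceAccelerationStructurePropertiesKHR asProps = {
        VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ACCELERATION_STRUCTURE_PROPERTIES_KHR };
    VkPhysicalDeviceProperties2 props2 = {
        VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2, &asProps };
    vkGetPhysicalDeviceProperties2(physDev, &props2);
    return asProps.minAccelerationStructureScratchOffsetAlignment;
}
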
@@ -4718,12 +4729,14 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
 {
     submitsNeeded |= IQueue::FAMILY_FLAGS::COMPUTE_BIT;
     // queue up a deferred allocation
-    params.scratchForDeviceASBuild->multi_deallocate(oldAllocCount,allocOffsets.data(),allocSizes.data(),params.compute->getFutureScratchSemaphore());
+    if (oldAllocCount)
+        params.scratchForDeviceASBuild->multi_deallocate(oldAllocCount,allocOffsets.data(),allocSizes.data(),params.compute->getFutureScratchSemaphore());
 }
 else
 {
     // release right away
-    params.scratchForDeviceASBuild->multi_deallocate(oldAllocCount,allocOffsets.data(),allocSizes.data());
+    if (oldAllocCount)
+        params.scratchForDeviceASBuild->multi_deallocate(oldAllocCount,allocOffsets.data(),allocSizes.data());
     for (const auto& info : buildInfos)
     {
         const auto stagingFound = findInStaging.template operator()<CPUAccelerationStructure>(info.dstAS);
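
Both deallocation paths are now guarded so that a zero `oldAllocCount` never reaches `multi_deallocate`, which in the deferred variant would otherwise enqueue a pointless release gated on the compute scratch semaphore. A toy model of the guarded call sites (the allocator type and its overloads here are hypothetical stand-ins):

#include <cstdint>
#include <functional>
#include <vector>

struct Semaphore {};
struct ScratchAllocator
{
    std::vector<std::function<void()>> deferredReleases;

    void multi_deallocate(uint32_t count, const uint64_t* offsets, const uint64_t* sizes)
    {
        // release [0,count) immediately
    }
    void multi_deallocate(uint32_t count, const uint64_t* offsets, const uint64_t* sizes, const Semaphore& waitFor)
    {
        // without the caller-side guard, count==0 would still enqueue a
        // do-nothing release that holds on to the semaphore
        deferredReleases.push_back([] { /* free after waitFor signals */ });
    }
};

void releaseOldAllocs(ScratchAllocator& alloc, uint32_t oldAllocCount,
                      const uint64_t* offsets, const uint64_t* sizes,
                      const Semaphore* computeScratchSemaphore)
{
    if (!oldAllocCount) // the guard the diff adds at both call sites
        return;
    if (computeScratchSemaphore) // deferred: the compute queue still reads the scratch
        alloc.multi_deallocate(oldAllocCount, offsets, sizes, *computeScratchSemaphore);
    else
        alloc.multi_deallocate(oldAllocCount, offsets, sizes);
}
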
@@ -4766,7 +4779,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
 auto allocCount = 0;
 auto deallocSrc = core::makeRAIIExiter([&params,&allocOffsets,&allocSizes,&alignments,&allocCount]()->void
 {
-    const auto beginIx = allocSizes.size()-alignments.size();
+    const auto beginIx = allocSizes.size()-allocCount;
     // if we got to the end of the loop, queue up the release of memory, otherwise release right away
     if (allocCount)
         params.scratchForDeviceASBuild->multi_deallocate(allocCount,allocOffsets.data()+beginIx,allocSizes.data()+beginIx);
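
The index fix in the RAII cleanup matters because `allocSizes` accumulates entries across all builds processed so far, while only the tail belongs to the current build; `allocCount` is also still zero if the loop exits before a successful allocation. A minimal sketch of the scope-guard pattern and the corrected index (simplified, not the actual `core::makeRAIIExiter` implementation):

#include <cstddef>
#include <type_traits>
#include <utility>
#include <vector>

// Simplified scope guard: run a callable when the enclosing scope exits.
template<typename F>
struct RAIIExiter
{
    F fn;
    ~RAIIExiter() { fn(); }
};
template<typename F>
auto makeRAIIExiter(F&& fn) { return RAIIExiter<std::decay_t<F>>{ std::forward<F>(fn) }; }

// allocSizes holds entries for every build handled so far; only the last
// allocCount of them belong to the current build, so the old
// "alignments.size()" tail length over-reached into earlier builds' entries.
void cleanupExample(std::vector<size_t>& allocSizes, const size_t& allocCount)
{
    auto deallocSrc = makeRAIIExiter([&]() -> void
    {
        const size_t beginIx = allocSizes.size() - allocCount; // first index owned by this build
        if (allocCount)
        {
            // multi_deallocate(allocCount, offsets+beginIx, sizes+beginIx) in the real code
        }
        (void)beginIx;
    });
    // ... allocation loop would run here; the guard fires on every exit path
}
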
@@ -4837,42 +4850,60 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
 if (const auto triCount=*(pPrimitiveCounts++); triCount)
 {
     auto size = geom.vertexStride*(geom.vertexData[1] ? 2:1)*geom.maxVertex;
+    uint16_t alignment = hlsl::max(0x1u<<hlsl::findLSB(geom.vertexStride),32u);
     if (geom.hasTransform())
+    {
         size = core::alignUp(size,alignof(float))+sizeof(hlsl::float32_t3x4);
-    auto alignment = 0u;
+        alignment = hlsl::max<uint16_t>(alignof(float),alignment);
+    }
+    uint16_t indexSize = 0u;
     switch (geom.indexType)
     {
         case E_INDEX_TYPE::EIT_16BIT:
-            alignment = alignof(uint16_t);
+            indexSize = sizeof(uint16_t);
             break;
         case E_INDEX_TYPE::EIT_32BIT:
-            alignment = alignof(uint32_t);
+            indexSize = sizeof(uint32_t);
             break;
         default:
             break;
     }
-    if (alignment)
-        size = core::alignUp(size,alignment)+triCount*3*alignment;
+    if (indexSize)
+    {
+        size = core::alignUp(size,indexSize)+triCount*3*indexSize;
+        alignment = hlsl::max<uint16_t>(indexSize,alignment);
+    }
     allocSizes.push_back(size);
-    alignments.push_back(hlsl::max(alignment,geom.vertexStride));
+    alignments.push_back(alignment);
+    const auto tmp = asToBuild.second.scratchSize;
+    logger.log("%p Triangle Data Size %d Align %d Scratch Size %d",system::ILogger::ELL_DEBUG,canonical.get(),size,alignment,tmp);
 }
         }
     }
 }
 allocOffsets.resize(allocSizes.size(),scratch_allocator_t::invalid_value);
 // allocate out of scratch or submit overflow; if allocation fails then flush and keep trying till space is made
-auto* const offsets = allocOffsets.data()+allocOffsets.size()-alignments.size();
-const auto* const sizes = allocSizes.data()+allocSizes.size()-alignments.size();
+auto* offsets = allocOffsets.data()+allocOffsets.size()-alignments.size();
+const auto* sizes = allocSizes.data()+allocSizes.size()-alignments.size();
+logger.log("%p Combined Size %d",system::ILogger::ELL_DEBUG,canonical.get(),std::accumulate(sizes,sizes+alignments.size(),0));
 for (uint32_t t=0; params.scratchForDeviceASBuild->multi_allocate(alignments.size(),offsets,sizes,alignments.data())!=0; t++)
-if (t==1) // don't flush right away because the allocator isn't defragmented yet
 {
-    recordBuildCommands();
-    // if writing to scratch directly, flush the writes
-    if (!flushRanges.empty())
+    if (t==1) // don't flush right away because the allocator isn't defragmented yet
     {
-        device->flushMappedMemoryRanges(flushRanges);
-        flushRanges.clear();
+        recordBuildCommands();
+        // the submit overflow deallocates old offsets and erases them from the temp arrays, so the pointers change
+        offsets = allocOffsets.data();
+        sizes = allocSizes.data();
+        // if writing to scratch directly, flush the writes
+        if (!flushRanges.empty())
+        {
+            device->flushMappedMemoryRanges(flushRanges);
+            flushRanges.clear();
+        }
+        drainCompute();
     }
-    drainCompute();
+    // we may be preventing ourselves from allocating memory, with one successful allocation still alive and fragmenting our allocator
+    params.scratchForDeviceASBuild->multi_deallocate(alignments.size(),offsets,sizes);
+    std::fill_n(offsets,alignments.size(),scratch_allocator_t::invalid_value);
 }
 // now upon a failure, our allocations will need to be deallocated
 allocCount = alignments.size();
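
The reshaped retry loop now frees whatever a partially successful `multi_allocate` left behind before each retry; otherwise those live suballocations keep fragmenting the pool and can starve the very allocation being retried. The shape of the loop in isolation, with stubs standing in for the allocator and the compute drain (hypothetical signatures, for illustration):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

static constexpr uint64_t invalid_value = ~0ull;

// Stubs standing in for scratchForDeviceASBuild->multi_allocate /
// multi_deallocate and the converter's drainCompute().
uint32_t multi_allocate(size_t, uint64_t*, const uint64_t*) { return 0; } // returns count NOT satisfied
void multi_deallocate(size_t, const uint64_t*, const uint64_t*) {}        // ignores invalid_value entries
void drainCompute() {}                                                     // submit + wait so deferred frees land

// Keep retrying until the whole batch fits. The crucial part mirrors the
// diff: a failed round may still have allocated *some* blocks, and leaving
// them alive fragments the pool, so free everything and reset the offsets
// before the next attempt.
void allocateScratchOrStall(std::vector<uint64_t>& offsets, const std::vector<uint64_t>& sizes)
{
    for (uint32_t t = 0; multi_allocate(offsets.size(), offsets.data(), sizes.data()) != 0; t++)
    {
        if (t == 1) // first retry is cheap; only drain the GPU if that fails too
            drainCompute();
        multi_deallocate(offsets.size(), offsets.data(), sizes.data());
        std::fill(offsets.begin(), offsets.end(), invalid_value);
    }
}
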
@@ -5055,7 +5086,7 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
         outGeom.indexType = geom.indexType;
         outGeom.geometryFlags = geom.geometryFlags;
     }
-    buildInfo.triangles = reinterpret_cast<const IGPUBottomLevelAccelerationStructure::Triangles<const IGPUBuffer>* const&>(trianglesOffset);
+    buildInfo.triangles = reinterpret_cast<const IGPUBottomLevelAccelerationStructure::Triangles<IGPUBuffer>* const&>(trianglesOffset);
 }
 success = pPrimitiveCounts==primitiveCounts.data()+primitiveCounts.size();
 rangeInfos.push_back(reinterpret_cast<const IGPUBottomLevelAccelerationStructure::BuildRangeInfo* const&>(geometryRangeInfoOffset));