Skip to content

Commit db31665

Browse files
stopped pretending this could all be driven by dispatch indirect
1 parent 80c3f4c commit db31665

File tree

7 files changed

+109
-89
lines changed

7 files changed

+109
-89
lines changed

examples_tests/22.RaytracedAO/Renderer.cpp

Lines changed: 46 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -43,16 +43,15 @@ auto fillIotaDescriptorBindingDeclarations = [](auto* outBindings, uint32_t acce
4343
}
4444
};
4545

46-
4746
Renderer::Renderer(IVideoDriver* _driver, IAssetManager* _assetManager, scene::ISceneManager* _smgr, bool useDenoiser) :
4847
m_useDenoiser(useDenoiser), m_driver(_driver), m_smgr(_smgr), m_assetManager(_assetManager),
4948
m_rrManager(ext::RadeonRays::Manager::create(m_driver)),
5049
#ifdef _NBL_BUILD_OPTIX_
5150
m_optixManager(), m_cudaStream(nullptr), m_optixContext(),
5251
#endif
5352
m_prevView(), m_sceneBound(FLT_MAX,FLT_MAX,FLT_MAX,-FLT_MAX,-FLT_MAX,-FLT_MAX),
54-
m_maxRaysPerDispatch(0), m_framesDispatched(0u), m_rcpPixelSize{0.f,0.f},
55-
m_staticViewData{{0.f,0.f,0.f},0u,{0u,0u},0u,0u}, m_raytraceCommonData{vec3(),0.f,0u,0u,0u,0u,0u},
53+
m_framesDispatched(0u), m_rcpPixelSize{0.f,0.f},
54+
m_staticViewData{{0.f,0.f,0.f},0u,{0u,0u},0u,0u}, m_raytraceCommonData{vec3(),0.f,0u,0u,0u,0u},
5655
m_indirectDrawBuffers{nullptr},m_cullPushConstants{core::matrix4SIMD(),1.f,0u,0u,0u},m_cullWorkGroups(0u),
5756
m_raygenWorkGroups{0u,0u},m_visibilityBuffer(nullptr),m_colorBuffer(nullptr)
5857
{
@@ -79,6 +78,22 @@ Renderer::Renderer(IVideoDriver* _driver, IAssetManager* _assetManager, scene::I
7978
break;
8079
}
8180

81+
// set up raycount buffers
82+
{
83+
const uint32_t zeros[RAYCOUNT_N_BUFFERING] = { 0u };
84+
m_rayCountBuffer = m_driver->createFilledDeviceLocalGPUBufferOnDedMem(sizeof(uint32_t)*RAYCOUNT_N_BUFFERING,zeros);
85+
IDriverMemoryBacked::SDriverMemoryRequirements reqs;
86+
reqs.vulkanReqs.size = sizeof(uint32_t);
87+
reqs.vulkanReqs.alignment = alignof(uint32_t);
88+
reqs.vulkanReqs.memoryTypeBits = ~0u;
89+
reqs.memoryHeapLocation = IDriverMemoryAllocation::ESMT_NOT_DEVICE_LOCAL;
90+
reqs.mappingCapability = IDriverMemoryAllocation::EMCF_COHERENT|IDriverMemoryAllocation::EMCF_CAN_MAP_FOR_READ;
91+
reqs.prefersDedicatedAllocation = 0u;
92+
reqs.requiresDedicatedAllocation = 0u;
93+
m_littleDownloadBuffer = m_driver->createGPUBufferOnDedMem(reqs);
94+
m_littleDownloadBuffer->getBoundMemory()->mapMemoryRange(IDriverMemoryAllocation::EMCAF_READ,{0,sizeof(uint32_t)});
95+
}
96+
8297
// set up Visibility Buffer pipeline
8398
{
8499
IGPUDescriptorSetLayout::SBinding binding;
@@ -87,11 +102,7 @@ Renderer::Renderer(IVideoDriver* _driver, IAssetManager* _assetManager, scene::I
87102
m_rasterInstanceDataDSLayout = m_driver->createGPUDescriptorSetLayout(&binding,&binding+1u);
88103
}
89104
{
90-
#ifndef DISABLE_NEE
91105
constexpr auto additionalGlobalDescriptorCount = 5u;
92-
#else
93-
constexpr auto additionalGlobalDescriptorCount = 3u;
94-
#endif
95106
IGPUDescriptorSetLayout::SBinding bindings[additionalGlobalDescriptorCount];
96107
fillIotaDescriptorBindingDeclarations(bindings,ISpecializedShader::ESS_COMPUTE|ISpecializedShader::ESS_VERTEX|ISpecializedShader::ESS_FRAGMENT,additionalGlobalDescriptorCount,asset::EDT_STORAGE_BUFFER);
97108

@@ -136,7 +147,6 @@ Renderer::Renderer(IVideoDriver* _driver, IAssetManager* _assetManager, scene::I
136147
bindings[5].type = asset::EDT_STORAGE_BUFFER;
137148
bindings[5].count = 2u;
138149
bindings[6].type = asset::EDT_STORAGE_BUFFER;
139-
bindings[6].count = 2u;
140150

141151
m_commonRaytracingDSLayout = m_driver->createGPUDescriptorSetLayout(bindings,bindings+raytracingCommonDescriptorCount);
142152
}
@@ -734,11 +744,12 @@ void Renderer::init(const SAssetBundle& meshes, core::smart_refctd_ptr<ICPUBuffe
734744
const bool success = extractIntegratorInfo(initData.globalMeta->m_global.m_integrator,bxdfSamples,maxNEESamples);
735745
assert(success && "unsupported integrator type");
736746

737-
auto setRayBufferSizes = [&bxdfSamples,&maxNEESamples,renderPixelCount,this,&raygenBufferSize,&intersectionBufferSize](uint32_t sampleMultiplier) -> void
747+
uint32_t _maxRaysPerDispatch = 0u;
748+
auto setRayBufferSizes = [&bxdfSamples,&maxNEESamples,renderPixelCount,this,&_maxRaysPerDispatch,&raygenBufferSize,&intersectionBufferSize](uint32_t sampleMultiplier) -> void
738749
{
739750
m_staticViewData.samplesPerPixelPerDispatch = (bxdfSamples+maxNEESamples)*sampleMultiplier;
740751
const size_t minimumSampleCountPerDispatch = static_cast<size_t>(renderPixelCount)*m_staticViewData.samplesPerPixelPerDispatch;
741-
m_maxRaysPerDispatch = static_cast<uint32_t>(minimumSampleCountPerDispatch);
752+
_maxRaysPerDispatch = static_cast<uint32_t>(minimumSampleCountPerDispatch);
742753
const auto doubleBufferSampleCountPerDispatch = minimumSampleCountPerDispatch*2ull;
743754

744755
raygenBufferSize = doubleBufferSampleCountPerDispatch*sizeof(::RadeonRays::ray);
@@ -748,7 +759,7 @@ void Renderer::init(const SAssetBundle& meshes, core::smart_refctd_ptr<ICPUBuffe
748759
{
749760
uint32_t sampleMultiplier = 0u;
750761
const auto maxSSBOSize = core::min(m_driver->getMaxSSBOSize(),256u<<20);
751-
while (raygenBufferSize<=maxSSBOSize && intersectionBufferSize<=maxSSBOSize) // for AMD && m_maxRaysPerDispatch*WORKGROUP_SIZE<=64<<10))
762+
while (raygenBufferSize<=maxSSBOSize && intersectionBufferSize<=maxSSBOSize) // for AMD && _maxRaysPerDispatch*WORKGROUP_SIZE<=64<<10))
752763
setRayBufferSizes(++sampleMultiplier);
753764
if (sampleMultiplier==1u)
754765
{
@@ -760,21 +771,6 @@ void Renderer::init(const SAssetBundle& meshes, core::smart_refctd_ptr<ICPUBuffe
760771
}
761772
}
762773

763-
// set up raycount buffers for RR
764-
{
765-
struct RayCountData
766-
{
767-
uint32_t rayCount;
768-
DispatchIndirectCommand_t params;
769-
};
770-
RayCountData data = {m_maxRaysPerDispatch,{0u,1u,1u}};
771-
for (auto i=0u; i<2u; i++)
772-
{
773-
m_rayCountBuffer[i].buffer = m_driver->createFilledDeviceLocalGPUBufferOnDedMem(sizeof(RayCountData),&data);
774-
m_rayCountBuffer[i].asRRBuffer = m_rrManager->linkBuffer(m_rayCountBuffer[i].buffer.get(),CL_MEM_READ_ONLY);
775-
}
776-
}
777-
778774
// create out screen-sized textures
779775
m_accumulation = createScreenSizedTexture(EF_R32G32_UINT,m_staticViewData.samplesPerPixelPerDispatch);
780776
m_tonemapOutput = createScreenSizedTexture(EF_A2B10G10R10_UNORM_PACK32);
@@ -896,7 +892,6 @@ void Renderer::init(const SAssetBundle& meshes, core::smart_refctd_ptr<ICPUBuffe
896892

897893
IGPUDescriptorSet::SDescriptorInfo infos[descriptorUpdateMaxCount];
898894
IGPUDescriptorSet::SWriteDescriptorSet writes[descriptorUpdateMaxCount];
899-
#ifndef DISABLE_NEE
900895
// set up rest of m_additionalGlobalDS
901896
{
902897
createFilledBufferAndSetUpInfoFromVector(infos+0,initData.lightCDF);
@@ -905,7 +900,7 @@ void Renderer::init(const SAssetBundle& meshes, core::smart_refctd_ptr<ICPUBuffe
905900
setDstSetAndDescTypesOnWrites(m_additionalGlobalDS.get(),writes,infos,{EDT_STORAGE_BUFFER,EDT_STORAGE_BUFFER},3u);
906901
}
907902
m_driver->updateDescriptorSets(descriptorUpdateCounts[0],writes,0u,nullptr);
908-
#endif
903+
909904
// set up m_commonRaytracingDS
910905
core::smart_refctd_ptr<IGPUBuffer> _staticViewDataBuffer;
911906
{
@@ -946,8 +941,7 @@ void Renderer::init(const SAssetBundle& meshes, core::smart_refctd_ptr<ICPUBuffe
946941
setImageInfo(infos+4,asset::EIL_GENERAL,core::smart_refctd_ptr(m_accumulation));
947942
createEmptyInteropBufferAndSetUpInfo(infos+5,m_rayBuffer[0],raygenBufferSize);
948943
createEmptyInteropBufferAndSetUpInfo(infos+6,m_rayBuffer[1],raygenBufferSize);
949-
setBufferInfo(infos+7,m_rayCountBuffer[0].buffer);
950-
setBufferInfo(infos+8,m_rayCountBuffer[1].buffer);
944+
setBufferInfo(infos+7,m_rayCountBuffer);
951945

952946
setDstSetAndDescTypesOnWrites(m_commonRaytracingDS.get(),writes,infos,{
953947
EDT_UNIFORM_BUFFER,
@@ -959,7 +953,6 @@ void Renderer::init(const SAssetBundle& meshes, core::smart_refctd_ptr<ICPUBuffe
959953
EDT_STORAGE_BUFFER
960954
});
961955
writes[5].count = 2u;
962-
writes[6].count = 2u;
963956
writes[6].info = infos+7;
964957
}
965958
initData = {}; // reclaim some memory
@@ -1107,7 +1100,6 @@ void Renderer::deinit()
11071100
};
11081101
deleteInteropBuffer(m_intersectionBuffer[i]);
11091102
deleteInteropBuffer(m_rayBuffer[i]);
1110-
deleteInteropBuffer(m_rayCountBuffer[i]);
11111103
}
11121104

11131105
m_raygenWorkGroups[0] = m_raygenWorkGroups[1] = 0u;
@@ -1137,7 +1129,6 @@ void Renderer::deinit()
11371129
m_staticViewData = {{0.f,0.f,0.f},0u,{0u,0u},0u,0u};
11381130
m_rcpPixelSize = {0.f,0.f};
11391131
m_framesDispatched = 0u;
1140-
m_maxRaysPerDispatch = 0u;
11411132
std::fill_n(m_prevView.pointer(),12u,0.f);
11421133
m_sceneBound = core::aabbox3df(FLT_MAX, FLT_MAX, FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX);
11431134

@@ -1268,8 +1259,9 @@ void Renderer::render(nbl::ITimer* timer)
12681259
}
12691260
// path trace
12701261
m_raytraceCommonData.depth = 0u;
1262+
uint32_t nextTraceRaycount = 0xdeadbeefu; // the raygen shader doesn't care
12711263
while (m_raytraceCommonData.depth!=m_maxDepth)
1272-
traceBounce();
1264+
nextTraceRaycount = traceBounce(nextTraceRaycount);
12731265

12741266
// resolve pseudo-MSAA
12751267
{
@@ -1332,8 +1324,11 @@ void Renderer::render(nbl::ITimer* timer)
13321324
}
13331325

13341326

1335-
void Renderer::traceBounce()
1327+
uint32_t Renderer::traceBounce(uint32_t raycount)
13361328
{
1329+
if (raycount==0u)
1330+
return 0u;
1331+
13371332
const uint32_t readIx = (++m_raytraceCommonData.depth)&0x1u;
13381333
const uint32_t writeIx = readIx^0x1u;
13391334
// trace bounce (accumulate contributions and optionally generate rays)
@@ -1348,7 +1343,7 @@ void Renderer::traceBounce()
13481343
descriptorSets[3] = m_closestHitDS.get();
13491344
m_driver->bindDescriptorSets(EPBP_COMPUTE,pipelineLayout,0u,4u,descriptorSets,nullptr);
13501345
m_driver->bindComputePipeline(m_closestHitPipeline.get());
1351-
m_driver->dispatchIndirect(m_rayCountBuffer[readIx].buffer.get(),sizeof(uint32_t));
1346+
m_driver->dispatch((raycount-1u)/WORKGROUP_SIZE+1u,1u,1u);
13521347
}
13531348
else
13541349
{
@@ -1360,18 +1355,17 @@ void Renderer::traceBounce()
13601355
// probably wise to flush all caches (in the future can optimize to texture_fetch|shader_image_access|shader_storage_buffer|blit|texture_download|...)
13611356
COpenGLExtensionHandler::pGlMemoryBarrier(GL_ALL_BARRIER_BITS);
13621357
}
1363-
// TODO: triple buffer the `m_rayCountBuffer` (clear,read,write)
1364-
m_driver->fillBuffer(m_rayCountBuffer[readIx].buffer.get(),0u,sizeof(uint32_t)*2u,0u);
13651358
// trace rays
13661359
if (m_raytraceCommonData.depth!=m_maxDepth)
13671360
{
1368-
if (m_rrManager->hasImplicitCL2GLSync())
1369-
glFlush(); // sync CL to GL
1370-
else
1371-
glFinish(); // sync CPU to GL
1361+
m_driver->copyBuffer(m_rayCountBuffer.get(),m_littleDownloadBuffer.get(),sizeof(uint32_t)*m_raytraceCommonData.rayCountWriteIx,0u,sizeof(uint32_t));
1362+
static_assert(core::isPoT(RAYCOUNT_N_BUFFERING),"Raycount Buffer needs to be PoT sized!");
1363+
m_raytraceCommonData.rayCountWriteIx = (++m_raytraceCommonData.rayCountWriteIx)&RAYCOUNT_N_BUFFERING_MASK;
1364+
glFinish(); // sync CPU to GL
1365+
const uint32_t nextTraceRaycount = *reinterpret_cast<uint32_t*>(m_littleDownloadBuffer->getBoundMemory()->getMappedPointer());
13721366

13731367
auto commandQueue = m_rrManager->getCLCommandQueue();
1374-
const cl_mem clObjects[] = {m_rayBuffer[writeIx].asRRBuffer.second,m_rayCountBuffer[writeIx].asRRBuffer.second,m_intersectionBuffer[writeIx].asRRBuffer.second};
1368+
const cl_mem clObjects[] = {m_rayBuffer[writeIx].asRRBuffer.second,m_intersectionBuffer[writeIx].asRRBuffer.second};
13751369
const auto objCount = sizeof(clObjects)/sizeof(cl_mem);
13761370
cl_event acquired=nullptr, raycastDone=nullptr;
13771371
// run the raytrace queries
@@ -1380,28 +1374,21 @@ void Renderer::traceBounce()
13801374

13811375
clEnqueueWaitForEvents(commandQueue,1u,&acquired);
13821376
m_rrManager->getRadeonRaysAPI()->QueryIntersection(
1383-
m_rayBuffer[writeIx].asRRBuffer.first,
1384-
m_rayCountBuffer[writeIx].asRRBuffer.first,m_maxRaysPerDispatch,
1377+
m_rayBuffer[writeIx].asRRBuffer.first,nextTraceRaycount,
13851378
m_intersectionBuffer[writeIx].asRRBuffer.first,nullptr,nullptr
13861379
);
13871380
clEnqueueMarker(commandQueue,&raycastDone);
13881381
}
13891382

1390-
if (m_rrManager->hasImplicitCL2GLSync())
1391-
{
1392-
// sync GL to CL
1393-
ocl::COpenCLHandler::ocl.pclEnqueueReleaseGLObjects(commandQueue, objCount, clObjects, 1u, &raycastDone, nullptr);
1394-
ocl::COpenCLHandler::ocl.pclFlush(commandQueue);
1395-
}
1396-
else
1397-
{
1398-
// sync CPU to CL
1399-
cl_event released;
1400-
ocl::COpenCLHandler::ocl.pclEnqueueReleaseGLObjects(commandQueue, objCount, clObjects, 1u, &raycastDone, &released);
1401-
ocl::COpenCLHandler::ocl.pclFlush(commandQueue);
1402-
ocl::COpenCLHandler::ocl.pclWaitForEvents(1u,&released);
1403-
}
1383+
// sync CPU to CL
1384+
cl_event released;
1385+
ocl::COpenCLHandler::ocl.pclEnqueueReleaseGLObjects(commandQueue, objCount, clObjects, 1u, &raycastDone, &released);
1386+
ocl::COpenCLHandler::ocl.pclFlush(commandQueue);
1387+
ocl::COpenCLHandler::ocl.pclWaitForEvents(1u,&released);
1388+
return nextTraceRaycount;
14041389
}
1390+
else
1391+
return 0u;
14051392
}
14061393

14071394

examples_tests/22.RaytracedAO/Renderer.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ class Renderer : public nbl::core::IReferenceCounted, public nbl::core::Interfac
8787

8888
nbl::core::smart_refctd_ptr<nbl::video::IGPUImageView> createScreenSizedTexture(nbl::asset::E_FORMAT format, uint32_t layers = 0u);
8989

90-
void traceBounce();
90+
uint32_t traceBounce(uint32_t raycount);
9191

9292

9393
// "constants"
@@ -108,6 +108,7 @@ class Renderer : public nbl::core::IReferenceCounted, public nbl::core::Interfac
108108

109109

110110
// persistent (intialized in constructor
111+
nbl::core::smart_refctd_ptr<nbl::video::IGPUBuffer> m_rayCountBuffer,m_littleDownloadBuffer;
111112
nbl::core::smart_refctd_ptr<nbl::video::IGPUDescriptorSetLayout> m_cullDSLayout;
112113
nbl::core::smart_refctd_ptr<const nbl::video::IGPUDescriptorSetLayout> m_perCameraRasterDSLayout;
113114
nbl::core::smart_refctd_ptr<nbl::video::IGPUDescriptorSetLayout> m_rasterInstanceDataDSLayout,m_additionalGlobalDSLayout,m_commonRaytracingDSLayout;
@@ -121,7 +122,6 @@ class Renderer : public nbl::core::IReferenceCounted, public nbl::core::Interfac
121122

122123
nbl::core::matrix3x4SIMD m_prevView;
123124
nbl::core::aabbox3df m_sceneBound;
124-
uint32_t m_maxRaysPerDispatch;
125125
uint32_t m_framesDispatched;
126126
vec2 m_rcpPixelSize;
127127
StaticViewData_t m_staticViewData;
@@ -150,7 +150,6 @@ class Renderer : public nbl::core::IReferenceCounted, public nbl::core::Interfac
150150
nbl::core::smart_refctd_ptr<nbl::video::IGPUBuffer> buffer;
151151
std::pair<::RadeonRays::Buffer*, cl_mem> asRRBuffer = { nullptr,0u };
152152
};
153-
InteropBuffer m_rayCountBuffer[2];
154153
InteropBuffer m_rayBuffer[2];
155154
InteropBuffer m_intersectionBuffer[2];
156155

examples_tests/22.RaytracedAO/closestHit.comp

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ layout(local_size_x = WORKGROUP_SIZE) in;
1010

1111
uint get_path_vertex_depth()
1212
{
13-
return bitfieldExtract(pc.cummon.samplesComputed_depth,16,16);
13+
return pc.cummon.depth;
1414
}
1515

1616
#include <nbl/builtin/glsl/ext/RadeonRays/intersection.glsl>
@@ -20,21 +20,22 @@ layout(set = 3, binding = 0, std430) restrict buffer Queries
2020
} intersections[2];
2121

2222

23-
bool get_sample_job(in uint vertex_depth_mod_2)
23+
bool get_sample_job()
2424
{
25-
return gl_GlobalInvocationID.x<traceIndirect[vertex_depth_mod_2].rayCount;
25+
return gl_GlobalInvocationID.x<rayCount[(pc.cummon.rayCountWriteIx-1u)&uint(RAYCOUNT_N_BUFFERING_MASK)];
2626
}
2727

2828
void main()
2929
{
30+
clear_raycount();
3031
bool hit = false;
31-
const uint vertex_depth = get_path_vertex_depth();
32-
const uint vertex_depth_mod_2 = vertex_depth&0x1u;
33-
if (get_sample_job(vertex_depth_mod_2))
32+
if (get_sample_job())
3433
{
3534
vec3 emissive = staticViewData.envmapBaseColor;
3635

3736
// basic reads
37+
const uint vertex_depth = get_path_vertex_depth();
38+
const uint vertex_depth_mod_2 = vertex_depth&0x1u;
3839
const nbl_glsl_ext_RadeonRays_Intersection intersection = intersections[vertex_depth_mod_2].data[gl_GlobalInvocationID.x];
3940
const nbl_glsl_ext_RadeonRays_ray ray = rays[vertex_depth_mod_2].data[gl_GlobalInvocationID.x];
4041

examples_tests/22.RaytracedAO/common.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
#define _COMMON_INCLUDED_
33

44

5-
#define DISABLE_NEE
6-
5+
#define RAYCOUNT_N_BUFFERING 4
6+
#define RAYCOUNT_N_BUFFERING_MASK (RAYCOUNT_N_BUFFERING-1)
77

88
#define MAX_TRIANGLES_IN_BATCH 512
99
#define MAX_ACCUMULATED_SAMPLES 0x10000

examples_tests/22.RaytracedAO/raygen.comp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ vec3 unpack_barycentrics(in uint data)
2121

2222
void main()
2323
{
24+
clear_raycount();
2425
bool hit = false;
2526
if (get_sample_job())
2627
{
@@ -64,11 +65,10 @@ void main()
6465
normalizedV = normalize(pc.cummon.camPos-hitWorldPos);
6566

6667
// generate rays
67-
const uint sampleID = bitfieldExtract(pc.cummon.samplesComputed_depth,0,16);
6868
const uint vertex_depth = 1u;
6969
generate_next_rays(
7070
staticViewData.samplesPerPixelPerDispatch,material,frontfacing,vertex_depth,
71-
scramble_start_state,sampleID,outPixelLocation,hitWorldPos,vec3(1.0)
71+
scramble_start_state,pc.cummon.samplesComputed,outPixelLocation,hitWorldPos,vec3(1.0)
7272
);
7373
}
7474

0 commit comments

Comments
 (0)