@@ -43,16 +43,15 @@ auto fillIotaDescriptorBindingDeclarations = [](auto* outBindings, uint32_t acce
43
43
}
44
44
};
45
45
46
-
47
46
Renderer::Renderer (IVideoDriver* _driver, IAssetManager* _assetManager, scene::ISceneManager* _smgr, bool useDenoiser) :
48
47
m_useDenoiser(useDenoiser), m_driver(_driver), m_smgr(_smgr), m_assetManager(_assetManager),
49
48
m_rrManager(ext::RadeonRays::Manager::create(m_driver)),
50
49
#ifdef _NBL_BUILD_OPTIX_
51
50
m_optixManager (), m_cudaStream(nullptr ), m_optixContext(),
52
51
#endif
53
52
m_prevView (), m_sceneBound(FLT_MAX,FLT_MAX,FLT_MAX,-FLT_MAX,-FLT_MAX,-FLT_MAX),
54
- m_maxRaysPerDispatch( 0 ), m_framesDispatched(0u ), m_rcpPixelSize{0 .f ,0 .f },
55
- m_staticViewData{{0 .f ,0 .f ,0 .f },0u ,{0u ,0u },0u ,0u }, m_raytraceCommonData{vec3 (),0 .f ,0u ,0u ,0u ,0u , 0u },
53
+ m_framesDispatched(0u ), m_rcpPixelSize{0 .f ,0 .f },
54
+ m_staticViewData{{0 .f ,0 .f ,0 .f },0u ,{0u ,0u },0u ,0u }, m_raytraceCommonData{vec3 (),0 .f ,0u ,0u ,0u ,0u },
56
55
m_indirectDrawBuffers{nullptr },m_cullPushConstants{core::matrix4SIMD (),1 .f ,0u ,0u ,0u },m_cullWorkGroups(0u ),
57
56
m_raygenWorkGroups{0u ,0u },m_visibilityBuffer(nullptr ),m_colorBuffer(nullptr )
58
57
{
@@ -79,6 +78,22 @@ Renderer::Renderer(IVideoDriver* _driver, IAssetManager* _assetManager, scene::I
79
78
break ;
80
79
}
81
80
81
+ // set up raycount buffers
82
+ {
83
+ const uint32_t zeros[RAYCOUNT_N_BUFFERING] = { 0u };
84
+ m_rayCountBuffer = m_driver->createFilledDeviceLocalGPUBufferOnDedMem (sizeof (uint32_t )*RAYCOUNT_N_BUFFERING,zeros);
85
+ IDriverMemoryBacked::SDriverMemoryRequirements reqs;
86
+ reqs.vulkanReqs .size = sizeof (uint32_t );
87
+ reqs.vulkanReqs .alignment = alignof (uint32_t );
88
+ reqs.vulkanReqs .memoryTypeBits = ~0u ;
89
+ reqs.memoryHeapLocation = IDriverMemoryAllocation::ESMT_NOT_DEVICE_LOCAL;
90
+ reqs.mappingCapability = IDriverMemoryAllocation::EMCF_COHERENT|IDriverMemoryAllocation::EMCF_CAN_MAP_FOR_READ;
91
+ reqs.prefersDedicatedAllocation = 0u ;
92
+ reqs.requiresDedicatedAllocation = 0u ;
93
+ m_littleDownloadBuffer = m_driver->createGPUBufferOnDedMem (reqs);
94
+ m_littleDownloadBuffer->getBoundMemory ()->mapMemoryRange (IDriverMemoryAllocation::EMCAF_READ,{0 ,sizeof (uint32_t )});
95
+ }
96
+
82
97
// set up Visibility Buffer pipeline
83
98
{
84
99
IGPUDescriptorSetLayout::SBinding binding;
@@ -87,11 +102,7 @@ Renderer::Renderer(IVideoDriver* _driver, IAssetManager* _assetManager, scene::I
87
102
m_rasterInstanceDataDSLayout = m_driver->createGPUDescriptorSetLayout (&binding,&binding+1u );
88
103
}
89
104
{
90
- #ifndef DISABLE_NEE
91
105
constexpr auto additionalGlobalDescriptorCount = 5u ;
92
- #else
93
- constexpr auto additionalGlobalDescriptorCount = 3u ;
94
- #endif
95
106
IGPUDescriptorSetLayout::SBinding bindings[additionalGlobalDescriptorCount];
96
107
fillIotaDescriptorBindingDeclarations (bindings,ISpecializedShader::ESS_COMPUTE|ISpecializedShader::ESS_VERTEX|ISpecializedShader::ESS_FRAGMENT,additionalGlobalDescriptorCount,asset::EDT_STORAGE_BUFFER);
97
108
@@ -136,7 +147,6 @@ Renderer::Renderer(IVideoDriver* _driver, IAssetManager* _assetManager, scene::I
136
147
bindings[5 ].type = asset::EDT_STORAGE_BUFFER;
137
148
bindings[5 ].count = 2u ;
138
149
bindings[6 ].type = asset::EDT_STORAGE_BUFFER;
139
- bindings[6 ].count = 2u ;
140
150
141
151
m_commonRaytracingDSLayout = m_driver->createGPUDescriptorSetLayout (bindings,bindings+raytracingCommonDescriptorCount);
142
152
}
@@ -734,11 +744,12 @@ void Renderer::init(const SAssetBundle& meshes, core::smart_refctd_ptr<ICPUBuffe
734
744
const bool success = extractIntegratorInfo (initData.globalMeta ->m_global .m_integrator ,bxdfSamples,maxNEESamples);
735
745
assert (success && " unsupported integrator type" );
736
746
737
- auto setRayBufferSizes = [&bxdfSamples,&maxNEESamples,renderPixelCount,this ,&raygenBufferSize,&intersectionBufferSize](uint32_t sampleMultiplier) -> void
747
+ uint32_t _maxRaysPerDispatch = 0u ;
748
+ auto setRayBufferSizes = [&bxdfSamples,&maxNEESamples,renderPixelCount,this ,&_maxRaysPerDispatch,&raygenBufferSize,&intersectionBufferSize](uint32_t sampleMultiplier) -> void
738
749
{
739
750
m_staticViewData.samplesPerPixelPerDispatch = (bxdfSamples+maxNEESamples)*sampleMultiplier;
740
751
const size_t minimumSampleCountPerDispatch = static_cast <size_t >(renderPixelCount)*m_staticViewData.samplesPerPixelPerDispatch ;
741
- m_maxRaysPerDispatch = static_cast <uint32_t >(minimumSampleCountPerDispatch);
752
+ _maxRaysPerDispatch = static_cast <uint32_t >(minimumSampleCountPerDispatch);
742
753
const auto doubleBufferSampleCountPerDispatch = minimumSampleCountPerDispatch*2ull ;
743
754
744
755
raygenBufferSize = doubleBufferSampleCountPerDispatch*sizeof (::RadeonRays::ray);
@@ -748,7 +759,7 @@ void Renderer::init(const SAssetBundle& meshes, core::smart_refctd_ptr<ICPUBuffe
748
759
{
749
760
uint32_t sampleMultiplier = 0u ;
750
761
const auto maxSSBOSize = core::min (m_driver->getMaxSSBOSize (),256u <<20 );
751
- while (raygenBufferSize<=maxSSBOSize && intersectionBufferSize<=maxSSBOSize) // for AMD && m_maxRaysPerDispatch *WORKGROUP_SIZE<=64<<10))
762
+ while (raygenBufferSize<=maxSSBOSize && intersectionBufferSize<=maxSSBOSize) // for AMD && _maxRaysPerDispatch *WORKGROUP_SIZE<=64<<10))
752
763
setRayBufferSizes (++sampleMultiplier);
753
764
if (sampleMultiplier==1u )
754
765
{
@@ -760,21 +771,6 @@ void Renderer::init(const SAssetBundle& meshes, core::smart_refctd_ptr<ICPUBuffe
760
771
}
761
772
}
762
773
763
- // set up raycount buffers for RR
764
- {
765
- struct RayCountData
766
- {
767
- uint32_t rayCount;
768
- DispatchIndirectCommand_t params;
769
- };
770
- RayCountData data = {m_maxRaysPerDispatch,{0u ,1u ,1u }};
771
- for (auto i=0u ; i<2u ; i++)
772
- {
773
- m_rayCountBuffer[i].buffer = m_driver->createFilledDeviceLocalGPUBufferOnDedMem (sizeof (RayCountData),&data);
774
- m_rayCountBuffer[i].asRRBuffer = m_rrManager->linkBuffer (m_rayCountBuffer[i].buffer .get (),CL_MEM_READ_ONLY);
775
- }
776
- }
777
-
778
774
// create out screen-sized textures
779
775
m_accumulation = createScreenSizedTexture (EF_R32G32_UINT,m_staticViewData.samplesPerPixelPerDispatch );
780
776
m_tonemapOutput = createScreenSizedTexture (EF_A2B10G10R10_UNORM_PACK32);
@@ -896,7 +892,6 @@ void Renderer::init(const SAssetBundle& meshes, core::smart_refctd_ptr<ICPUBuffe
896
892
897
893
IGPUDescriptorSet::SDescriptorInfo infos[descriptorUpdateMaxCount];
898
894
IGPUDescriptorSet::SWriteDescriptorSet writes[descriptorUpdateMaxCount];
899
- #ifndef DISABLE_NEE
900
895
// set up rest of m_additionalGlobalDS
901
896
{
902
897
createFilledBufferAndSetUpInfoFromVector (infos+0 ,initData.lightCDF );
@@ -905,7 +900,7 @@ void Renderer::init(const SAssetBundle& meshes, core::smart_refctd_ptr<ICPUBuffe
905
900
setDstSetAndDescTypesOnWrites (m_additionalGlobalDS.get (),writes,infos,{EDT_STORAGE_BUFFER,EDT_STORAGE_BUFFER},3u );
906
901
}
907
902
m_driver->updateDescriptorSets (descriptorUpdateCounts[0 ],writes,0u ,nullptr );
908
- # endif
903
+
909
904
// set up m_commonRaytracingDS
910
905
core::smart_refctd_ptr<IGPUBuffer> _staticViewDataBuffer;
911
906
{
@@ -946,8 +941,7 @@ void Renderer::init(const SAssetBundle& meshes, core::smart_refctd_ptr<ICPUBuffe
946
941
setImageInfo (infos+4 ,asset::EIL_GENERAL,core::smart_refctd_ptr (m_accumulation));
947
942
createEmptyInteropBufferAndSetUpInfo (infos+5 ,m_rayBuffer[0 ],raygenBufferSize);
948
943
createEmptyInteropBufferAndSetUpInfo (infos+6 ,m_rayBuffer[1 ],raygenBufferSize);
949
- setBufferInfo (infos+7 ,m_rayCountBuffer[0 ].buffer );
950
- setBufferInfo (infos+8 ,m_rayCountBuffer[1 ].buffer );
944
+ setBufferInfo (infos+7 ,m_rayCountBuffer);
951
945
952
946
setDstSetAndDescTypesOnWrites (m_commonRaytracingDS.get (),writes,infos,{
953
947
EDT_UNIFORM_BUFFER,
@@ -959,7 +953,6 @@ void Renderer::init(const SAssetBundle& meshes, core::smart_refctd_ptr<ICPUBuffe
959
953
EDT_STORAGE_BUFFER
960
954
});
961
955
writes[5 ].count = 2u ;
962
- writes[6 ].count = 2u ;
963
956
writes[6 ].info = infos+7 ;
964
957
}
965
958
initData = {}; // reclaim some memory
@@ -1107,7 +1100,6 @@ void Renderer::deinit()
1107
1100
};
1108
1101
deleteInteropBuffer (m_intersectionBuffer[i]);
1109
1102
deleteInteropBuffer (m_rayBuffer[i]);
1110
- deleteInteropBuffer (m_rayCountBuffer[i]);
1111
1103
}
1112
1104
1113
1105
m_raygenWorkGroups[0 ] = m_raygenWorkGroups[1 ] = 0u ;
@@ -1137,7 +1129,6 @@ void Renderer::deinit()
1137
1129
m_staticViewData = {{0 .f ,0 .f ,0 .f },0u ,{0u ,0u },0u ,0u };
1138
1130
m_rcpPixelSize = {0 .f ,0 .f };
1139
1131
m_framesDispatched = 0u ;
1140
- m_maxRaysPerDispatch = 0u ;
1141
1132
std::fill_n (m_prevView.pointer (),12u ,0 .f );
1142
1133
m_sceneBound = core::aabbox3df (FLT_MAX, FLT_MAX, FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX);
1143
1134
@@ -1268,8 +1259,9 @@ void Renderer::render(nbl::ITimer* timer)
1268
1259
}
1269
1260
// path trace
1270
1261
m_raytraceCommonData.depth = 0u ;
1262
+ uint32_t nextTraceRaycount = 0xdeadbeefu ; // the raygen shader doesn't care
1271
1263
while (m_raytraceCommonData.depth !=m_maxDepth)
1272
- traceBounce ();
1264
+ nextTraceRaycount = traceBounce (nextTraceRaycount );
1273
1265
1274
1266
// resolve pseudo-MSAA
1275
1267
{
@@ -1332,8 +1324,11 @@ void Renderer::render(nbl::ITimer* timer)
1332
1324
}
1333
1325
1334
1326
1335
- void Renderer::traceBounce ()
1327
+ uint32_t Renderer::traceBounce (uint32_t raycount )
1336
1328
{
1329
+ if (raycount==0u )
1330
+ return 0u ;
1331
+
1337
1332
const uint32_t readIx = (++m_raytraceCommonData.depth )&0x1u ;
1338
1333
const uint32_t writeIx = readIx^0x1u ;
1339
1334
// trace bounce (accumulate contributions and optionally generate rays)
@@ -1348,7 +1343,7 @@ void Renderer::traceBounce()
1348
1343
descriptorSets[3 ] = m_closestHitDS.get ();
1349
1344
m_driver->bindDescriptorSets (EPBP_COMPUTE,pipelineLayout,0u ,4u ,descriptorSets,nullptr );
1350
1345
m_driver->bindComputePipeline (m_closestHitPipeline.get ());
1351
- m_driver->dispatchIndirect (m_rayCountBuffer[readIx]. buffer . get (), sizeof ( uint32_t ) );
1346
+ m_driver->dispatch ((raycount- 1u )/WORKGROUP_SIZE+ 1u , 1u , 1u );
1352
1347
}
1353
1348
else
1354
1349
{
@@ -1360,18 +1355,17 @@ void Renderer::traceBounce()
1360
1355
// probably wise to flush all caches (in the future can optimize to texture_fetch|shader_image_access|shader_storage_buffer|blit|texture_download|...)
1361
1356
COpenGLExtensionHandler::pGlMemoryBarrier (GL_ALL_BARRIER_BITS);
1362
1357
}
1363
- // TODO: triple buffer the `m_rayCountBuffer` (clear,read,write)
1364
- m_driver->fillBuffer (m_rayCountBuffer[readIx].buffer .get (),0u ,sizeof (uint32_t )*2u ,0u );
1365
1358
// trace rays
1366
1359
if (m_raytraceCommonData.depth !=m_maxDepth)
1367
1360
{
1368
- if (m_rrManager->hasImplicitCL2GLSync ())
1369
- glFlush (); // sync CL to GL
1370
- else
1371
- glFinish (); // sync CPU to GL
1361
+ m_driver->copyBuffer (m_rayCountBuffer.get (),m_littleDownloadBuffer.get (),sizeof (uint32_t )*m_raytraceCommonData.rayCountWriteIx ,0u ,sizeof (uint32_t ));
1362
+ static_assert (core::isPoT (RAYCOUNT_N_BUFFERING)," Raycount Buffer needs to be PoT sized!" );
1363
+ m_raytraceCommonData.rayCountWriteIx = (++m_raytraceCommonData.rayCountWriteIx )&RAYCOUNT_N_BUFFERING_MASK;
1364
+ glFinish (); // sync CPU to GL
1365
+ const uint32_t nextTraceRaycount = *reinterpret_cast <uint32_t *>(m_littleDownloadBuffer->getBoundMemory ()->getMappedPointer ());
1372
1366
1373
1367
auto commandQueue = m_rrManager->getCLCommandQueue ();
1374
- const cl_mem clObjects[] = {m_rayBuffer[writeIx].asRRBuffer .second ,m_rayCountBuffer[writeIx]. asRRBuffer . second , m_intersectionBuffer[writeIx].asRRBuffer .second };
1368
+ const cl_mem clObjects[] = {m_rayBuffer[writeIx].asRRBuffer .second ,m_intersectionBuffer[writeIx].asRRBuffer .second };
1375
1369
const auto objCount = sizeof (clObjects)/sizeof (cl_mem);
1376
1370
cl_event acquired=nullptr , raycastDone=nullptr ;
1377
1371
// run the raytrace queries
@@ -1380,28 +1374,21 @@ void Renderer::traceBounce()
1380
1374
1381
1375
clEnqueueWaitForEvents (commandQueue,1u ,&acquired);
1382
1376
m_rrManager->getRadeonRaysAPI ()->QueryIntersection (
1383
- m_rayBuffer[writeIx].asRRBuffer .first ,
1384
- m_rayCountBuffer[writeIx].asRRBuffer .first ,m_maxRaysPerDispatch,
1377
+ m_rayBuffer[writeIx].asRRBuffer .first ,nextTraceRaycount,
1385
1378
m_intersectionBuffer[writeIx].asRRBuffer .first ,nullptr ,nullptr
1386
1379
);
1387
1380
clEnqueueMarker (commandQueue,&raycastDone);
1388
1381
}
1389
1382
1390
- if (m_rrManager->hasImplicitCL2GLSync ())
1391
- {
1392
- // sync GL to CL
1393
- ocl::COpenCLHandler::ocl.pclEnqueueReleaseGLObjects (commandQueue, objCount, clObjects, 1u , &raycastDone, nullptr );
1394
- ocl::COpenCLHandler::ocl.pclFlush (commandQueue);
1395
- }
1396
- else
1397
- {
1398
- // sync CPU to CL
1399
- cl_event released;
1400
- ocl::COpenCLHandler::ocl.pclEnqueueReleaseGLObjects (commandQueue, objCount, clObjects, 1u , &raycastDone, &released);
1401
- ocl::COpenCLHandler::ocl.pclFlush (commandQueue);
1402
- ocl::COpenCLHandler::ocl.pclWaitForEvents (1u ,&released);
1403
- }
1383
+ // sync CPU to CL
1384
+ cl_event released;
1385
+ ocl::COpenCLHandler::ocl.pclEnqueueReleaseGLObjects (commandQueue, objCount, clObjects, 1u , &raycastDone, &released);
1386
+ ocl::COpenCLHandler::ocl.pclFlush (commandQueue);
1387
+ ocl::COpenCLHandler::ocl.pclWaitForEvents (1u ,&released);
1388
+ return nextTraceRaycount;
1404
1389
}
1390
+ else
1391
+ return 0u ;
1405
1392
}
1406
1393
1407
1394
0 commit comments