Skip to content

Commit 54c01f4

Browse files
devsh — Today at 19:28
I FOUND AN NVIDIA SHADER COMPILER BUG! (the very first one in my 10+ year career) Remember kids, don't index arrays of SSBOs (even with a uniform index, and even with NV_gpu_shader5):
```glsl
layout(set = 2, binding = 5, std430) restrict buffer Rays
{
    nbl_glsl_ext_RadeonRays_ray data[];
} rays[2];
// later
rays[vertex_depth_mod_2_inv].data[outputID] = newRay;
```
This caused various vec3 members of nbl_glsl_ext_RadeonRays_ray to have 0.0 in the z coordinate.
1 parent cf98437 commit 54c01f4

File tree

4 files changed

+56
-50
lines changed

4 files changed

+56
-50
lines changed

examples_tests/22.RaytracedAO/Renderer.cpp

Lines changed: 37 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,6 @@ Renderer::Renderer(IVideoDriver* _driver, IAssetManager* _assetManager, scene::I
145145
bindings[3].type = asset::EDT_UNIFORM_TEXEL_BUFFER;
146146
bindings[4].type = asset::EDT_STORAGE_IMAGE;
147147
bindings[5].type = asset::EDT_STORAGE_BUFFER;
148-
bindings[5].count = 2u;
149148
bindings[6].type = asset::EDT_STORAGE_BUFFER;
150149

151150
m_commonRaytracingDSLayout = m_driver->createGPUDescriptorSetLayout(bindings,bindings+raytracingCommonDescriptorCount);
@@ -168,14 +167,11 @@ Renderer::Renderer(IVideoDriver* _driver, IAssetManager* _assetManager, scene::I
168167
m_raygenDSLayout = m_driver->createGPUDescriptorSetLayout(bindings,bindings+raygenDescriptorCount);
169168
}
170169
{
171-
IGPUDescriptorSetLayout::SBinding binding;
172-
binding.binding = 0;
173-
binding.type = EDT_STORAGE_BUFFER;
174-
binding.count = 2u;
175-
binding.stageFlags = ISpecializedShader::ESS_COMPUTE;
176-
binding.samplers = nullptr;
170+
constexpr auto closestHitDescriptorCount = 2u;
171+
IGPUDescriptorSetLayout::SBinding bindings[2];
172+
fillIotaDescriptorBindingDeclarations(bindings,ISpecializedShader::ESS_COMPUTE,closestHitDescriptorCount,EDT_STORAGE_BUFFER);
177173

178-
m_closestHitDSLayout = m_driver->createGPUDescriptorSetLayout(&binding,&binding+1u);
174+
m_closestHitDSLayout = m_driver->createGPUDescriptorSetLayout(bindings,bindings+closestHitDescriptorCount);
179175
}
180176
{
181177
constexpr auto resolveDescriptorCount = 3u;
@@ -655,7 +651,7 @@ core::smart_refctd_ptr<IGPUImageView> Renderer::createScreenSizedTexture(E_FORMA
655651
return m_driver->createGPUImageView(std::move(viewparams));
656652
}
657653

658-
constexpr uint16_t m_maxDepth = 3u;
654+
constexpr uint16_t m_maxDepth = 6u;
659655
constexpr uint16_t m_UNUSED_russianRouletteDepth = 5u;
660656
bool extractIntegratorInfo(const ext::MitsubaLoader::CElementIntegrator& integrator, uint32_t &bxdfSamples, uint32_t &maxNEESamples)
661657
{
@@ -780,18 +776,17 @@ void Renderer::init(const SAssetBundle& meshes, core::smart_refctd_ptr<ICPUBuffe
780776
// i know what I'm doing
781777
auto globalBackendDataDSLayout = core::smart_refctd_ptr<IGPUDescriptorSetLayout>(const_cast<IGPUDescriptorSetLayout*>(m_globalBackendDataDS->getLayout()));
782778

783-
//
784-
785779
// cull
786780
{
787781
SPushConstantRange range{ISpecializedShader::ESS_COMPUTE,0u,sizeof(CullShaderData_t)};
788782
auto _cullPipelineLayout = m_driver->createGPUPipelineLayout(&range,&range+1u,core::smart_refctd_ptr(globalBackendDataDSLayout),core::smart_refctd_ptr(m_cullDSLayout),nullptr,nullptr);
789783
m_cullPipeline = m_driver->createGPUComputePipeline(nullptr,std::move(_cullPipelineLayout),gpuSpecializedShaderFromFile(m_assetManager,m_driver,"../cull.comp"));
790784
}
791-
785+
786+
for (auto i=0u; i<2u; i++)
787+
m_commonRaytracingDS[i] = m_driver->createGPUDescriptorSet(core::smart_refctd_ptr(m_commonRaytracingDSLayout));
792788

793789
SPushConstantRange raytracingCommonPCRange{ISpecializedShader::ESS_COMPUTE,0u,sizeof(RaytraceShaderCommonData_t)};
794-
m_commonRaytracingDS = m_driver->createGPUDescriptorSet(core::smart_refctd_ptr(m_commonRaytracingDSLayout));
795790
(std::ofstream("runtime_defines.glsl")
796791
<< "#define _NBL_EXT_MITSUBA_LOADER_VT_STORAGE_VIEW_COUNT " << initData.globalMeta->m_global.getVTStorageViewCount() << "\n"
797792
<< initData.globalMeta->m_global.m_materialCompilerGLSL_declarations
@@ -823,8 +818,9 @@ void Renderer::init(const SAssetBundle& meshes, core::smart_refctd_ptr<ICPUBuffe
823818
core::smart_refctd_ptr(m_closestHitDSLayout)
824819
);
825820
m_closestHitPipeline = m_driver->createGPUComputePipeline(nullptr,std::move(_closestHitPipelineLayout),gpuSpecializedShaderFromFile(m_assetManager,m_driver,"../closestHit.comp"));
826-
827-
m_closestHitDS = m_driver->createGPUDescriptorSet(core::smart_refctd_ptr(m_closestHitDSLayout));
821+
822+
for (auto i=0u; i<2u; i++)
823+
m_closestHitDS[i] = m_driver->createGPUDescriptorSet(core::smart_refctd_ptr(m_closestHitDSLayout));
828824
}
829825
// resolve
830826
{
@@ -837,7 +833,7 @@ void Renderer::init(const SAssetBundle& meshes, core::smart_refctd_ptr<ICPUBuffe
837833

838834
//
839835
constexpr uint32_t descriptorUpdates = 5;
840-
constexpr uint32_t descriptorUpdateCounts[descriptorUpdates] = {2u,9u,2u,2u,3u};
836+
constexpr uint32_t descriptorUpdateCounts[descriptorUpdates] = {2u,7u,2u,2u,3u};
841837
constexpr uint32_t descriptorUpdateMaxCount = *std::max_element(descriptorUpdateCounts,descriptorUpdateCounts+descriptorUpdates);
842838

843839
//
@@ -943,10 +939,9 @@ void Renderer::init(const SAssetBundle& meshes, core::smart_refctd_ptr<ICPUBuffe
943939
}
944940
setImageInfo(infos+4,asset::EIL_GENERAL,core::smart_refctd_ptr(m_accumulation));
945941
createEmptyInteropBufferAndSetUpInfo(infos+5,m_rayBuffer[0],raygenBufferSize);
946-
createEmptyInteropBufferAndSetUpInfo(infos+6,m_rayBuffer[1],raygenBufferSize);
947-
setBufferInfo(infos+7,m_rayCountBuffer);
942+
setBufferInfo(infos+6,m_rayCountBuffer);
948943

949-
setDstSetAndDescTypesOnWrites(m_commonRaytracingDS.get(),writes,infos,{
944+
setDstSetAndDescTypesOnWrites(m_commonRaytracingDS[0].get(),writes,infos,{
950945
EDT_UNIFORM_BUFFER,
951946
EDT_STORAGE_BUFFER,
952947
EDT_STORAGE_IMAGE,
@@ -955,11 +950,14 @@ void Renderer::init(const SAssetBundle& meshes, core::smart_refctd_ptr<ICPUBuffe
955950
EDT_STORAGE_BUFFER,
956951
EDT_STORAGE_BUFFER
957952
});
958-
writes[5].count = 2u;
959-
writes[6].info = infos+7;
953+
m_driver->updateDescriptorSets(descriptorUpdateCounts[1],writes,0u,nullptr);
954+
// set up second DS
955+
createEmptyInteropBufferAndSetUpInfo(infos+5,m_rayBuffer[1],raygenBufferSize);
956+
for (auto i=0u; i<descriptorUpdateCounts[1]; i++)
957+
writes[i].dstSet = m_commonRaytracingDS[1].get();
958+
m_driver->updateDescriptorSets(descriptorUpdateCounts[1],writes,0u,nullptr);
960959
}
961960
initData = {}; // reclaim some memory
962-
m_driver->updateDescriptorSets(7u,writes,0u,nullptr);
963961
// set up m_raygenDS
964962
{
965963
visibilityBuffer = createScreenSizedTexture(EF_R32G32B32A32_UINT);
@@ -969,14 +967,17 @@ void Renderer::init(const SAssetBundle& meshes, core::smart_refctd_ptr<ICPUBuffe
969967
}
970968
m_driver->updateDescriptorSets(descriptorUpdateCounts[2],writes,0u,nullptr);
971969
// set up m_closestHitDS
970+
for (auto i=0u; i<2u; i++)
972971
{
973-
createEmptyInteropBufferAndSetUpInfo(infos+0,m_intersectionBuffer[0],intersectionBufferSize);
974-
createEmptyInteropBufferAndSetUpInfo(infos+1,m_intersectionBuffer[1],intersectionBufferSize);
975-
976-
setDstSetAndDescTypesOnWrites(m_closestHitDS.get(),writes,infos,{EDT_STORAGE_BUFFER,EDT_STORAGE_BUFFER});
977-
writes->count = 2u;
972+
const auto other = i^0x1u;
973+
infos[0u].desc = m_rayBuffer[other].buffer;
974+
infos[0u].buffer.offset = 0u;
975+
infos[0u].buffer.size = m_rayBuffer[other].buffer->getSize();
976+
createEmptyInteropBufferAndSetUpInfo(infos+1,m_intersectionBuffer[other],intersectionBufferSize);
977+
978+
setDstSetAndDescTypesOnWrites(m_closestHitDS[i].get(),writes,infos,{EDT_STORAGE_BUFFER,EDT_STORAGE_BUFFER});
979+
m_driver->updateDescriptorSets(descriptorUpdateCounts[3],writes,0u,nullptr);
978980
}
979-
m_driver->updateDescriptorSets(1u,writes,0u,nullptr);
980981
// set up m_resolveDS
981982
{
982983
infos[0].buffer = {0u,_staticViewDataBuffer->getSize()};
@@ -1107,9 +1108,9 @@ void Renderer::deinit()
11071108

11081109
m_raygenWorkGroups[0] = m_raygenWorkGroups[1] = 0u;
11091110
m_resolveDS = nullptr;
1110-
m_closestHitDS = nullptr;
1111+
m_closestHitDS[0] = m_closestHitDS[1] = nullptr;
11111112
m_raygenDS = nullptr;
1112-
m_commonRaytracingDS = nullptr;
1113+
m_commonRaytracingDS[0] = m_commonRaytracingDS[1] = nullptr;
11131114
m_additionalGlobalDS = nullptr;
11141115
m_rasterInstanceDataDS = nullptr;
11151116
m_globalBackendDataDS = nullptr;
@@ -1329,8 +1330,7 @@ void Renderer::render(nbl::ITimer* timer)
13291330

13301331
uint32_t Renderer::traceBounce(uint32_t raycount)
13311332
{
1332-
const uint32_t readIx = (++m_raytraceCommonData.depth)&0x1u;
1333-
const uint32_t writeIx = readIx^0x1u;
1333+
const uint32_t descSetIx = (m_raytraceCommonData.depth++)&0x1u;
13341334
if (raycount==0u)
13351335
return 0u;
13361336
// trace bounce (accumulate contributions and optionally generate rays)
@@ -1339,10 +1339,10 @@ uint32_t Renderer::traceBounce(uint32_t raycount)
13391339
const auto* pipelineLayout = (continuation ? m_closestHitPipeline:m_raygenPipeline)->getLayout();
13401340
m_driver->pushConstants(pipelineLayout,ISpecializedShader::ESS_COMPUTE,0u,sizeof(RaytraceShaderCommonData_t),&m_raytraceCommonData);
13411341

1342-
IGPUDescriptorSet* descriptorSets[4] = {m_globalBackendDataDS.get(),m_additionalGlobalDS.get(),m_commonRaytracingDS.get()};
1342+
IGPUDescriptorSet* descriptorSets[4] = {m_globalBackendDataDS.get(),m_additionalGlobalDS.get(),m_commonRaytracingDS[descSetIx].get()};
13431343
if (continuation)
13441344
{
1345-
descriptorSets[3] = m_closestHitDS.get();
1345+
descriptorSets[3] = m_closestHitDS[descSetIx].get();
13461346
m_driver->bindDescriptorSets(EPBP_COMPUTE,pipelineLayout,0u,4u,descriptorSets,nullptr);
13471347
m_driver->bindComputePipeline(m_closestHitPipeline.get());
13481348
m_driver->dispatch((raycount-1u)/WORKGROUP_SIZE+1u,1u,1u);
@@ -1369,7 +1369,7 @@ uint32_t Renderer::traceBounce(uint32_t raycount)
13691369
m_raytraceCommonData.rayCountWriteIx = (++m_raytraceCommonData.rayCountWriteIx)&RAYCOUNT_N_BUFFERING_MASK;
13701370

13711371
auto commandQueue = m_rrManager->getCLCommandQueue();
1372-
const cl_mem clObjects[] = {m_rayBuffer[writeIx].asRRBuffer.second,m_intersectionBuffer[writeIx].asRRBuffer.second};
1372+
const cl_mem clObjects[] = {m_rayBuffer[descSetIx].asRRBuffer.second,m_intersectionBuffer[descSetIx].asRRBuffer.second};
13731373
const auto objCount = sizeof(clObjects)/sizeof(cl_mem);
13741374
cl_event acquired=nullptr, raycastDone=nullptr;
13751375
// run the raytrace queries
@@ -1378,8 +1378,8 @@ uint32_t Renderer::traceBounce(uint32_t raycount)
13781378

13791379
clEnqueueWaitForEvents(commandQueue,1u,&acquired);
13801380
m_rrManager->getRadeonRaysAPI()->QueryIntersection(
1381-
m_rayBuffer[writeIx].asRRBuffer.first,nextTraceRaycount,
1382-
m_intersectionBuffer[writeIx].asRRBuffer.first,nullptr,nullptr
1381+
m_rayBuffer[descSetIx].asRRBuffer.first,nextTraceRaycount,
1382+
m_intersectionBuffer[descSetIx].asRRBuffer.first,nullptr,nullptr
13831383
);
13841384
clEnqueueMarker(commandQueue,&raycastDone);
13851385
}

examples_tests/22.RaytracedAO/Renderer.h

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ class Renderer : public nbl::core::IReferenceCounted, public nbl::core::Interfac
4949

5050

5151
_NBL_STATIC_INLINE_CONSTEXPR uint32_t RandomDimsPerPathVertex = 3u;
52-
_NBL_STATIC_INLINE_CONSTEXPR uint32_t MaxDimensions = RandomDimsPerPathVertex*4u;
52+
_NBL_STATIC_INLINE_CONSTEXPR uint32_t MaxDimensions = RandomDimsPerPathVertex*5u;
5353
static const float AntiAliasingSequence[4096][2];
5454
protected:
5555
~Renderer();
@@ -141,8 +141,10 @@ class Renderer : public nbl::core::IReferenceCounted, public nbl::core::Interfac
141141
nbl::core::smart_refctd_ptr<nbl::video::IGPUDescriptorSet> m_perCameraRasterDS;
142142

143143
nbl::core::smart_refctd_ptr<nbl::video::IGPUComputePipeline> m_cullPipeline,m_raygenPipeline,m_closestHitPipeline,m_resolvePipeline;
144-
nbl::core::smart_refctd_ptr<nbl::video::IGPUDescriptorSet> m_globalBackendDataDS,m_additionalGlobalDS,m_commonRaytracingDS;
145-
nbl::core::smart_refctd_ptr<nbl::video::IGPUDescriptorSet> m_rasterInstanceDataDS,m_raygenDS,m_closestHitDS,m_resolveDS;
144+
nbl::core::smart_refctd_ptr<nbl::video::IGPUDescriptorSet> m_globalBackendDataDS,m_additionalGlobalDS;
145+
nbl::core::smart_refctd_ptr<nbl::video::IGPUDescriptorSet> m_commonRaytracingDS[2];
146+
nbl::core::smart_refctd_ptr<nbl::video::IGPUDescriptorSet> m_rasterInstanceDataDS,m_raygenDS,m_resolveDS;
147+
nbl::core::smart_refctd_ptr<nbl::video::IGPUDescriptorSet> m_closestHitDS[2];
146148
uint32_t m_raygenWorkGroups[2];
147149

148150
struct InteropBuffer

examples_tests/22.RaytracedAO/closestHit.comp

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,14 @@ uint get_path_vertex_depth()
1414
}
1515

1616
#include <nbl/builtin/glsl/ext/RadeonRays/intersection.glsl>
17-
layout(set = 3, binding = 0, std430) restrict buffer Queries
17+
layout(set = 3, binding = 0, std430) restrict readonly buffer SourceRays
1818
{
19-
nbl_glsl_ext_RadeonRays_Intersection data[];
20-
} intersections[2];
19+
nbl_glsl_ext_RadeonRays_ray sourceRays[];
20+
};
21+
layout(set = 3, binding = 1, std430) restrict buffer Queries
22+
{
23+
nbl_glsl_ext_RadeonRays_Intersection intersections[];
24+
};
2125

2226

2327
bool get_sample_job()
@@ -36,8 +40,8 @@ void main()
3640
// basic reads
3741
const uint vertex_depth = get_path_vertex_depth();
3842
const uint vertex_depth_mod_2 = vertex_depth&0x1u;
39-
const nbl_glsl_ext_RadeonRays_Intersection intersection = intersections[vertex_depth_mod_2].data[gl_GlobalInvocationID.x];
40-
const nbl_glsl_ext_RadeonRays_ray ray = rays[vertex_depth_mod_2].data[gl_GlobalInvocationID.x];
43+
const nbl_glsl_ext_RadeonRays_Intersection intersection = intersections[gl_GlobalInvocationID.x];
44+
const nbl_glsl_ext_RadeonRays_ray ray = sourceRays[gl_GlobalInvocationID.x];
4145

4246
const uint batchInstanceGUID = intersection.shapeid;
4347
const uint invalidID = 0x80000000u;
@@ -59,7 +63,7 @@ void main()
5963
// obtain ray incoming direction
6064
normalizedV = -ray.direction;
6165
// clear the hit success flag
62-
intersections[vertex_depth_mod_2].data[gl_GlobalInvocationID.x].shapeid = -1;
66+
intersections[gl_GlobalInvocationID.x].shapeid = -1;
6367

6468
const uvec3 indices = get_triangle_indices(batchInstanceGUID,triangleID);
6569

examples_tests/22.RaytracedAO/raytraceCommon.glsl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,10 @@ layout(set = 2, binding = 3) uniform usamplerBuffer sampleSequence;
3434
layout(set = 2, binding = 4, rg32ui) restrict uniform uimage2DArray accumulation;
3535
// ray data
3636
#include <nbl/builtin/glsl/ext/RadeonRays/ray.glsl>
37-
layout(set = 2, binding = 5, std430) restrict buffer Rays
37+
layout(set = 2, binding = 5, std430) restrict writeonly buffer SinkRays
3838
{
39-
nbl_glsl_ext_RadeonRays_ray data[];
40-
} rays[2];
39+
nbl_glsl_ext_RadeonRays_ray sinkRays[];
40+
};
4141
#include <nbl/builtin/glsl/utils/indirect_commands.glsl>
4242
layout(set = 2, binding = 6) restrict coherent buffer RayCount // maybe remove coherent keyword
4343
{
@@ -280,7 +280,7 @@ for (uint i=1u; i!=vertex_depth; i++)
280280
newRay.useless_padding[0] = packHalf2x16(nextThroughput[i].rg);
281281
newRay.useless_padding[1] = bitfieldInsert(packHalf2x16(nextThroughput[i].bb),sampleID+i,16,16);
282282
const uint outputID = baseOutputID+(offset++);
283-
rays[vertex_depth_mod_2_inv].data[outputID] = newRay;
283+
sinkRays[outputID] = newRay;
284284
}
285285
}
286286

0 commit comments

Comments
 (0)