Skip to content

Commit c6b0347

Browse files
committed
Skinning data benchmark test case 5 (subgroups)
Using our brand new spirv-cross contribution. Warning: I've had to explicitly set compilation to SPIR-V 1.5 in IGLSLCompiler; I think we have to talk about the possible implications. Works fine, but I didn't run the actual benchmark. RenderDoc crashes on it though :c
1 parent 2dab0f2 commit c6b0347

File tree

6 files changed

+187
-28
lines changed

6 files changed

+187
-28
lines changed
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
#include "common.glsl"
2+
3+
#extension GL_KHR_shader_subgroup_ballot : require
4+
5+
// Per-bone skinning data. main() multiplies boneMatrix with the vertex position
// and mat3(normalMatrix) with the vertex normal.
struct BoneData
6+
{
7+
mat4 boneMatrix;
8+
mat4x3 normalMatrix;
9+
};
10+
11+
// Read-only storage buffer (set 0, binding 0, std430, row_major) that exposes the
// bone array as BoneData structs. getBone() indexes it directly by bone ID in its
// fallback path.
layout(std430, set = 0, binding = 0, row_major) readonly buffer BoneMatrices_struct
12+
{
13+
BoneData data[];
14+
} boneSSBO_structs;
15+
// Read-only storage buffer (set 0, binding 1, std430) that exposes a flat uint
// array. getBone()'s subgroup path reads it one dword per invocation and
// reassembles the bone with toBoneData().
// NOTE(review): presumably this aliases the same memory as boneSSBO_structs; the
// host-side descriptor write for the second binding copies w[0] without changing
// its .binding field — verify that binding 1 is actually written.
layout(std430, set = 0, binding = 1) readonly buffer BoneMatrices_dword
16+
{
17+
uint data[];
18+
} boneSSBO_dwords;
19+
20+
// Vertex interface. pos, normal and the vNormal output exist only when BENCHMARK
// is not defined; in the BENCHMARK build main() declares constant pos/normal
// instead. boneID is an input in both builds.
#ifndef BENCHMARK
21+
layout(location = 0) in vec3 pos;
22+
layout(location = 3) in vec3 normal;
23+
layout(location = 0) out vec3 vNormal;
24+
#endif
25+
layout(location = 4) in uint boneID;
26+
27+
// Number of dwords getBone() treats as one bone record when addressing
// boneSSBO_dwords (bone index * OBJ_DWORDS).
// NOTE(review): toBoneData() only reads dwords 0..27. A row_major mat4 followed
// by a row_major mat4x3 would seem to occupy 28 dwords under std430, so confirm
// that the real per-bone stride of the buffer is 32 dwords — TODO confirm.
#define OBJ_DWORDS 32 // sizeof(BoneData), must be PoT
28+
// Raw, untyped view of one bone record as OBJ_DWORDS uints.
struct BoneData_dword
29+
{
30+
uint data[OBJ_DWORDS];
31+
};
32+
// Reinterprets the raw dwords of one bone record as a BoneData.
// Every dword is bit-cast with uintBitsToFloat (no numeric conversion).
//   bone   - raw record, as assembled by getBone()
//   return - boneMatrix built from dwords 0..15, normalMatrix from dwords 16..27;
//            dwords 28..31 are not read.
BoneData toBoneData(in BoneData_dword bone)
33+
{
34+
BoneData retval;
35+
//tpose because it was loaded as row_major
36+
//one-liner because glslang doesnt support multiline preproc definitions
37+
// Column c of boneMatrix is gathered with a stride of 4 dwords:
// element (row r, column c) comes from bone.data[4*r + c].
#define GET_BONE_MATRIX_COL(c) retval.boneMatrix[c].x = uintBitsToFloat(bone.data[c]);retval.boneMatrix[c].y = uintBitsToFloat(bone.data[c+4]);retval.boneMatrix[c].z = uintBitsToFloat(bone.data[c+8]);retval.boneMatrix[c].w = uintBitsToFloat(bone.data[c+12])
38+
39+
GET_BONE_MATRIX_COL(0);
40+
GET_BONE_MATRIX_COL(1);
41+
GET_BONE_MATRIX_COL(2);
42+
GET_BONE_MATRIX_COL(3);
43+
44+
//tpose because it was loaded as row_major
45+
//one-liner because glslang doesnt support multiline preproc definitions
46+
// Same gather for normalMatrix (4 columns of vec3), starting at dword 16:
// element (row r, column c) comes from bone.data[16 + 4*r + c], r in 0..2.
#define GET_NORMAL_MATRIX_COL(c) retval.normalMatrix[c].x = uintBitsToFloat(bone.data[16+c]);retval.normalMatrix[c].y = uintBitsToFloat(bone.data[16+c+4]);retval.normalMatrix[c].z = uintBitsToFloat(bone.data[16+c+8])
47+
48+
GET_NORMAL_MATRIX_COL(0);
49+
GET_NORMAL_MATRIX_COL(1);
50+
GET_NORMAL_MATRIX_COL(2);
51+
GET_NORMAL_MATRIX_COL(3);
52+
53+
return retval;
54+
}
55+
// Lower bound that getBone() compares incrLog2 against before taking its
// subgroup fast path (2^4 dwords = 64 bytes).
#define COALESCING_DWORDS_LOG2 4 // GCN can fetch only 64bytes in a single request
56+
// Number of dwords getBone() copies per broadcast loop; two such loops fill the
// 32-entry BoneData_dword.
#define SUBGROUP_THRESH 16
57+
58+
// Returns the BoneData for bone index _boneID.
//
// Fallback path: a plain struct read, boneSSBO_structs.data[_boneID].
// Fast path (taken only when the ballot test below passes): each invocation loads
// one dword from boneSSBO_dwords, the dwords are redistributed with
// subgroupBroadcast into a BoneData_dword, and toBoneData() converts the result.
BoneData getBone(uint _boneID)
59+
{
60+
//#ifdef IRR_GL_KHR_shader_subgroup_basic_size
61+
// if a set of invocations are active without gaps we can do a fast path
62+
// Ballot of the invocations that are active at this point.
const uvec4 activeMask = subgroupBallot(true);
63+
// Number of active invocations.
const int incr = int(subgroupBallotBitCount(activeMask));
64+
// Index of the highest set bit in the ballot.
// NOTE(review): this is a bit index, not log2(incr). The bit count can never
// exceed (highest bit index + 1), so (1<<incrLog2)==incr combined with
// incrLog2>=4 looks unsatisfiable, which would make the fast path below dead
// code — confirm; findMSB(incr) may have been intended.
const int incrLog2 = int(subgroupBallotFindMSB(activeMask));
65+
if ((0x1<<incrLog2)==incr && incrLog2>=COALESCING_DWORDS_LOG2) // contiguous segment of active warps is required
66+
{
67+
BoneData_dword retval;
68+
// Dword offset of this invocation's bone record. This local shadows the
// global vertex input that is also named boneID.
uint boneID = _boneID*uint(OBJ_DWORDS);
69+
70+
// basically fetch bones for one target invocation at a time
71+
// Only the low 64 bits of the ballot are tracked.
uvec2 outstandingLoadsMask = activeMask.xy;
72+
// maybe unroll a few times manually
73+
while (any(notEqual(outstandingLoadsMask,uvec2(0u))))
74+
{
75+
// more work required to make this work with gl_SubgroupSize > OBJ_DWORDS but good enough to benchmark
76+
// Take the bone offset of the lowest-numbered lane that is still outstanding.
uint subgroupBoneID = subgroupBroadcast(boneID,subgroupBallotFindLSB(uvec4(outstandingLoadsMask,0u,0u)));
77+
// Lanes that want that same bone are served this iteration ...
bool willLoadBone = subgroupBoneID==boneID;
78+
// ... and are cleared from the outstanding set.
outstandingLoadsMask ^= subgroupBallot(willLoadBone).xy;
79+
80+
81+
// NOTE(review): this uses the invocation's own boneID rather than
// subgroupBoneID, so lanes whose bone differs from the target fetch from
// their own record and the address is not uniform despite the name —
// confirm which offset was intended.
uint dynamically_uniform_addr = boneID+gl_SubgroupInvocationID;
82+
// use all SIMD lanes to load but then only some to read from subgroup registers
83+
uint tmp = boneSSBO_dwords.data[dynamically_uniform_addr];
84+
// NOTE(review): compares the highest-bit index with a dword count (32);
// presumably the active invocation count (incr) was meant — confirm.
const bool notEnoughInvocations = incrLog2<OBJ_DWORDS;
85+
86+
// First half of the record: entries 0..15 are read from lanes 0..15 of tmp.
// NOTE(review): subgroupBroadcast is called inside a branch that only the
// willLoadBone lanes enter; verify that the source lanes are guaranteed to
// be active there.
if (willLoadBone)
87+
{
88+
int oit=0, iit=0;
89+
for (int j=0; j<SUBGROUP_THRESH; j++)
90+
retval.data[oit++] = subgroupBroadcast(tmp,iit++);
91+
}
92+
// Second load, offset by the number of active invocations.
if (notEnoughInvocations)
93+
{
94+
tmp = boneSSBO_dwords.data[dynamically_uniform_addr+incr];
95+
}
96+
// Second half of the record: entries 16..31.
// NOTE(review): when the reload above happened the source lanes start at 16,
// otherwise at 0 (which re-reads the lanes used for the first half); this
// selection looks inverted relative to the reload — confirm.
if (willLoadBone)
97+
{
98+
int oit=SUBGROUP_THRESH, iit=notEnoughInvocations ? SUBGROUP_THRESH:0;
99+
for (int j=0; j<SUBGROUP_THRESH; j++)
100+
retval.data[oit++] = subgroupBroadcast(tmp,iit++);
101+
}
102+
}
103+
104+
return toBoneData(retval);
105+
}
106+
else
107+
//#endif
108+
// Fallback: direct read of the typed struct view.
return boneSSBO_structs.data[_boneID];
109+
}
110+
111+
// Vertex entry point: fetches the bone selected by the boneID input through
// getBone() and transforms the vertex with it.
void main()
112+
{
113+
#ifdef BENCHMARK
114+
// The benchmark build has no pos/normal vertex inputs; constants stand in for them.
const vec3 pos = vec3(1.0, 2.0, 3.0);
115+
const vec3 normal = vec3(1.0, 2.0, 3.0);
116+
#endif
117+
BoneData bone = getBone(boneID);
118+
#ifndef BENCHMARK
119+
gl_Position = bone.boneMatrix * vec4(pos, 1.0);
120+
vNormal = mat3(bone.normalMatrix) * normalize(normal);
121+
#else
122+
gl_Position = bone.boneMatrix * vec4(pos, 1.0);
123+
// There is no vNormal output in this build, so the normal-matrix product is
// folded into gl_Position — presumably so the normalMatrix fetch is not
// optimized away; confirm.
gl_Position.xyz += mat3(bone.normalMatrix) * normal;
124+
#endif
125+
126+
}
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
#version 460 core
2+
#define BENCHMARK
3+
#include "5.vert"
Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
1+
#ifdef __cplusplus
2+
#define uint uint32_t
3+
#endif
14
#define MAX_OBJ_CNT 3000
25
#define MAX_BONE_CNT 37
3-
#define MAT_MAX_CNT (MAX_OBJ_CNT * MAX_BONE_CNT)
4-
#define BONE_VEC_MAX_CNT (MAT_MAX_CNT * 4)
5-
#define NORM_VEC_MAX_CNT (MAT_MAX_CNT * 3)
6-
#define BONE_COMP_MAX_CNT (MAT_MAX_CNT * 16)
7-
#define NORM_COMP_MAX_CNT (MAT_MAX_CNT * 9)
6+
#define MAT_MAX_CNT uint (MAX_OBJ_CNT * MAX_BONE_CNT)
7+
#define BONE_VEC_MAX_CNT uint (MAT_MAX_CNT * 4)
8+
#define NORM_VEC_MAX_CNT uint (MAT_MAX_CNT * 3)
9+
#define BONE_COMP_MAX_CNT uint (MAT_MAX_CNT * 16)
10+
#define NORM_COMP_MAX_CNT uint (MAT_MAX_CNT * 9)

examples_tests/31.SkinningDataBenchmark/main.cpp

Lines changed: 42 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,9 @@ IFrameBuffer* createDepthOnlyFrameBuffer(video::IVideoDriver* driver)
7979
return frameBuffer;
8080
}
8181

82+
constexpr uint32_t TEST_CASE_COUNT = 5u;
83+
constexpr uint32_t TEST_CASE_SUBGROUPS = 4u;
84+
8285
int main()
8386
{
8487
// create device with full flexibility over creation parameters
@@ -117,14 +120,16 @@ int main()
117120
auto vertexShaderBundle_2 = am->getAsset("../test_2.vert", lp);
118121
auto vertexShaderBundle_3 = am->getAsset("../test_3.vert", lp);
119122
auto vertexShaderBundle_4 = am->getAsset("../test_4.vert", lp);
123+
auto vertexShaderBundle_5 = am->getAsset("../test_5.vert", lp);
120124
#else
121125
auto vertexShaderBundle_1 = am->getAsset("../benchmark_1.vert", lp);
122126
auto vertexShaderBundle_2 = am->getAsset("../benchmark_2.vert", lp);
123127
auto vertexShaderBundle_3 = am->getAsset("../benchmark_3.vert", lp);
124128
auto vertexShaderBundle_4 = am->getAsset("../benchmark_4.vert", lp);
129+
auto vertexShaderBundle_5 = am->getAsset("../benchmark_5.vert", lp);
125130
#endif
126131
auto fragShaderBundle = am->getAsset("../dirLight.frag", lp);
127-
ICPUSpecializedShader* shaders[4][2];
132+
ICPUSpecializedShader* shaders[TEST_CASE_COUNT][2];
128133
shaders[0][0] = IAsset::castDown<ICPUSpecializedShader>(vertexShaderBundle_1.getContents().begin()->get());
129134
shaders[0][1] = IAsset::castDown<ICPUSpecializedShader>(fragShaderBundle.getContents().begin()->get());
130135
shaders[1][0] = IAsset::castDown<ICPUSpecializedShader>(vertexShaderBundle_2.getContents().begin()->get());
@@ -133,6 +138,8 @@ int main()
133138
shaders[2][1] = IAsset::castDown<ICPUSpecializedShader>(fragShaderBundle.getContents().begin()->get());
134139
shaders[3][0] = IAsset::castDown<ICPUSpecializedShader>(vertexShaderBundle_4.getContents().begin()->get());
135140
shaders[3][1] = IAsset::castDown<ICPUSpecializedShader>(fragShaderBundle.getContents().begin()->get());
141+
shaders[4][0] = IAsset::castDown<ICPUSpecializedShader>(vertexShaderBundle_5.getContents().begin()->get());
142+
shaders[4][1] = IAsset::castDown<ICPUSpecializedShader>(fragShaderBundle.getContents().begin()->get());
136143

137144
core::vector<uint16_t> boneMatMaxCnt;
138145

@@ -320,7 +327,7 @@ int main()
320327
core::matrix4SIMD boneMatrix;
321328
core::matrix3x4SIMD normalMatrix;
322329
};
323-
core::smart_refctd_ptr<IGPUBuffer> drawDataBuffer[4];
330+
core::smart_refctd_ptr<IGPUBuffer> drawDataBuffer[TEST_CASE_COUNT];
324331
vector<core::matrix3x4SIMD> translationMatrices_2(diskCount);
325332
core::vector<core::matrix4SIMD> boneMatrices(boneMatrixCnt);
326333
core::vector<core::matrix3x4SIMD> normalMatrices(boneMatrixCnt);
@@ -350,6 +357,8 @@ int main()
350357

351358
//as floats
352359
drawDataBuffer[3] = driver->createDeviceLocalGPUBufferOnDedMem((BONE_COMP_MAX_CNT + NORM_COMP_MAX_CNT) * sizeof(float));
360+
361+
drawDataBuffer[TEST_CASE_SUBGROUPS] = drawDataBuffer[0];
353362
}
354363

355364

@@ -367,9 +376,10 @@ int main()
367376
uint32_t matrixOffsets[16];
368377
};
369378

370-
core::smart_refctd_ptr<IGPUPipelineLayout> gpuPipelineLayout[4];
371-
core::smart_refctd_ptr<IGPURenderpassIndependentPipeline> gpuPipeline[4];
372-
core::smart_refctd_ptr<IGPUDescriptorSet> descriptorSet[4];
379+
//TODO
380+
core::smart_refctd_ptr<IGPUPipelineLayout> gpuPipelineLayout[TEST_CASE_COUNT];
381+
core::smart_refctd_ptr<IGPURenderpassIndependentPipeline> gpuPipeline[TEST_CASE_COUNT];
382+
core::smart_refctd_ptr<IGPUDescriptorSet> descriptorSet[TEST_CASE_COUNT];
373383

374384
Shader3PushConstants s3pc;
375385
s3pc.matrixOffsets = core::vector4du32_SIMD(0u, boneMatrixCnt, boneMatrixCnt * 2, boneMatrixCnt * 3);
@@ -379,42 +389,50 @@ int main()
379389
s4pc.matrixOffsets[i] = i * boneMatrixCnt;
380390

381391
{
382-
asset::SPushConstantRange range[4] = {
392+
asset::SPushConstantRange range[TEST_CASE_COUNT] = {
383393
asset::ISpecializedShader::ESS_UNKNOWN, 0u, 0u,
384394
asset::ISpecializedShader::ESS_UNKNOWN, 0u, 0u,
385395
asset::ISpecializedShader::ESS_VERTEX, 0u, sizeof(Shader3PushConstants),
386-
asset::ISpecializedShader::ESS_VERTEX, 0u, sizeof(Shader4PushConstants)
396+
asset::ISpecializedShader::ESS_VERTEX, 0u, sizeof(Shader4PushConstants),
397+
asset::ISpecializedShader::ESS_UNKNOWN, 0u, 0u
387398
};
388399

389-
for (uint32_t i = 0u; i < 4u; i++)
400+
//TODO
401+
for (uint32_t i = 0u; i < TEST_CASE_COUNT; i++)
390402
{
391403
core::smart_refctd_ptr<IGPUDescriptorSetLayout> layout;
392404
{
393-
video::IGPUDescriptorSetLayout::SBinding b[1];
405+
video::IGPUDescriptorSetLayout::SBinding b[2];
394406
b[0].binding = 0u;
395407
b[0].count = 1u;
396408
b[0].type = EDT_STORAGE_BUFFER;
409+
b[1] = b[0];
410+
b[1].binding = 1u;
397411

398-
layout = driver->createGPUDescriptorSetLayout(b, b + 1);
412+
uint32_t count = i == TEST_CASE_SUBGROUPS ? 2u : 1u;
413+
layout = driver->createGPUDescriptorSetLayout(b, b + count);
399414
}
400415

401416
descriptorSet[i] = driver->createGPUDescriptorSet(core::smart_refctd_ptr(layout));
402417
{
403-
video::IGPUDescriptorSet::SWriteDescriptorSet w;
404-
w.binding = 0u;
405-
w.arrayElement = 0u;
406-
w.count = 1u;
407-
w.descriptorType = EDT_STORAGE_BUFFER;
408-
w.dstSet = descriptorSet[i].get();
418+
video::IGPUDescriptorSet::SWriteDescriptorSet w[2];
419+
w[0].binding = 0u;
420+
w[0].arrayElement = 0u;
421+
w[0].count = 1u;
422+
w[0].descriptorType = EDT_STORAGE_BUFFER;
423+
w[0].dstSet = descriptorSet[i].get();
424+
w[1] = w[0];
409425

410426
video::IGPUDescriptorSet::SDescriptorInfo info;
411427
info.buffer.offset = 0u;
412428
info.buffer.size = drawDataBuffer[i]->getSize();
413429
info.desc = drawDataBuffer[i];
414430

415-
w.info = &info;
431+
w[0].info = &info;
432+
w[1].info = &info;
416433

417-
driver->updateDescriptorSets(1u, &w, 0u, nullptr);
434+
uint32_t count = i == TEST_CASE_SUBGROUPS ? 2u : 1u;
435+
driver->updateDescriptorSets(count, w, 0u, nullptr);
418436
}
419437

420438
auto gpuShaders = driver->getGPUObjectsFromAssets(shaders[i], shaders[i] + 2);
@@ -520,8 +538,9 @@ int main()
520538
{
521539
switch (caseID)
522540
{
523-
case 0:
524-
case 1:
541+
case 0: [[fallthrough]];
542+
case 1: [[fallthrough]];
543+
case TEST_CASE_SUBGROUPS:
525544
break;
526545
case 2:
527546
driver->pushConstants(gpuPipelineLayout[2].get(), asset::ISpecializedShader::ESS_VERTEX, 0u, sizeof(Shader3PushConstants), &s3pc);
@@ -538,7 +557,8 @@ int main()
538557
{
539558
switch (caseID)
540559
{
541-
case 0:
560+
case 0: [[fallthrough]];
561+
case TEST_CASE_SUBGROUPS:
542562
{
543563
const size_t matricesByteSize = sizeof(BoneNormalMatPair) * boneAndNormalMatrices.size();
544564

@@ -591,7 +611,7 @@ int main()
591611

592612
constexpr uint32_t iterationCnt = 1000u;
593613
constexpr uint32_t warmupIterationCnt = iterationCnt / 10u;
594-
for (uint32_t caseID = 0u; caseID < 4u; caseID++)
614+
for (uint32_t caseID = 0u; caseID < TEST_CASE_COUNT; caseID++)
595615
{
596616
os::Printer::print(std::string("Benchmark for case nr. " + std::to_string(caseID)));
597617

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
#version 460 core
2+
#include "5.vert"
3+

src/irr/asset/IGLSLCompiler.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ namespace irr
2121
namespace asset
2222
{
2323

24+
static constexpr shaderc_spirv_version TARGET_SPIRV_VERSION = shaderc_spirv_version_1_5;
25+
2426
IGLSLCompiler::IGLSLCompiler(io::IFileSystem* _fs) : m_inclHandler(core::make_smart_refctd_ptr<CIncludeHandler>(_fs)), m_fs(_fs)
2527
{
2628
//m_inclHandler->addBuiltinIncludeLoader(core::make_smart_refctd_ptr<asset::CGLSLScanBuiltinIncludeLoader>());
@@ -37,6 +39,7 @@ core::smart_refctd_ptr<ICPUBuffer> IGLSLCompiler::compileSPIRVFromGLSL(const cha
3739

3840
shaderc::Compiler comp;
3941
shaderc::CompileOptions options;//default options
42+
options.SetTargetSpirv(TARGET_SPIRV_VERSION);
4043
const shaderc_shader_kind stage = _stage==ISpecializedShader::ESS_UNKNOWN ? shaderc_glsl_infer_from_source : ESStoShadercEnum(_stage);
4144
const size_t glsl_len = strlen(_glslCode);
4245
if (_genDebugInfo)
@@ -222,7 +225,8 @@ core::smart_refctd_ptr<ICPUShader> IGLSLCompiler::resolveIncludeDirectives(std::
222225
{
223226
impl::disableAllDirectivesExceptIncludes(glslCode);//all "#", except those in "#include"/"#version"/"#pragma shader_stage(...)", replaced with `PREPROC_DIRECTIVE_DISABLER`
224227
shaderc::Compiler comp;
225-
shaderc::CompileOptions options;//default options
228+
shaderc::CompileOptions options;
229+
options.SetTargetSpirv(TARGET_SPIRV_VERSION);
226230
options.SetIncluder(std::make_unique<impl::Includer>(m_inclHandler.get(), m_fs, _maxSelfInclusionCnt+1u));//custom #include handler
227231
const shaderc_shader_kind stage = _stage==ISpecializedShader::ESS_UNKNOWN ? shaderc_glsl_infer_from_source : ESStoShadercEnum(_stage);
228232
auto res = comp.PreprocessGlsl(glslCode, stage, _originFilepath, options);

0 commit comments

Comments
 (0)