Skip to content

Commit 73de386

Browse files
committed
Improve cluster building performance
1 parent 7329bb4 commit 73de386

File tree

5 files changed

+67
-57
lines changed

5 files changed

+67
-57
lines changed

src/Renderer/ClusteredRenderer.cpp

Lines changed: 28 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,11 @@
55
#include <bx/string.h>
66
#include <glm/matrix.hpp>
77
#include <glm/gtc/type_ptr.hpp>
8+
#include <glm/ext/matrix_relational.hpp>
89

910
ClusteredRenderer::ClusteredRenderer(const Scene* scene) :
1011
Renderer(scene),
12+
oldProjMat(glm::zero<glm::mat4>()),
1113
clusterBuildingComputeProgram(BGFX_INVALID_HANDLE),
1214
lightCullingComputeProgram(BGFX_INVALID_HANDLE),
1315
lightingProgram(BGFX_INVALID_HANDLE),
@@ -54,20 +56,18 @@ void ClusteredRenderer::onRender(float dt)
5456
vLightCulling,
5557
vLighting
5658
};
57-
59+
5860
bgfx::setViewName(vClusterBuilding, "Cluster building pass (compute)");
59-
//bgfx::setViewClear(vClusterBuilding, BGFX_CLEAR_NONE);
61+
bgfx::setViewClear(vClusterBuilding, BGFX_CLEAR_NONE);
6062
// set u_viewRect for screen2Eye to work correctly
6163
bgfx::setViewRect(vClusterBuilding, 0, 0, width, height);
6264
// this could be set by a different renderer, reset it (D3D12 cares and crashes)
6365
bgfx::setViewFrameBuffer(vClusterBuilding, BGFX_INVALID_HANDLE);
64-
//bgfx::touch(vClusterBuilding);
6566

6667
bgfx::setViewName(vLightCulling, "Clustered light culling pass (compute)");
67-
//bgfx::setViewClear(vLightCulling, BGFX_CLEAR_NONE);
68+
bgfx::setViewClear(vLightCulling, BGFX_CLEAR_NONE);
6869
bgfx::setViewRect(vLightCulling, 0, 0, width, height);
6970
bgfx::setViewFrameBuffer(vLightCulling, BGFX_INVALID_HANDLE);
70-
//bgfx::touch(vLightCulling);
7171

7272
bgfx::setViewName(vLighting, "Clustered lighting pass");
7373
bgfx::setViewClear(vLighting, BGFX_CLEAR_COLOR | BGFX_CLEAR_DEPTH, clearColor, 1.0f, 0);
@@ -88,24 +88,37 @@ void ClusteredRenderer::onRender(float dt)
8888

8989
// cluster building
9090

91-
clusters.bindBuffers(false); // write access, all buffers
91+
// only run this step if the camera parameters changed (aspect ratio, fov, near/far plane)
92+
// cluster bounds are saved in camera coordinates so they don't change with camera movement
93+
94+
// ideally we'd compare the relative error here but a correct implementation would involve
95+
// a bunch of costly matrix operations: https://floating-point-gui.de/errors/comparison/
96+
// comparing the absolute error against a rather small epsilon here works as long as the values
97+
// in the projection matrix aren't getting too large
98+
bool buildClusters = glm::any(glm::notEqual(projMat, oldProjMat, 0.00001f));
99+
if(buildClusters)
100+
{
101+
oldProjMat = projMat;
102+
103+
clusters.bindBuffers(false); // write access, all buffers
92104

93-
bgfx::dispatch(vClusterBuilding,
94-
clusterBuildingComputeProgram,
95-
ClusterShader::CLUSTERS_X,
96-
ClusterShader::CLUSTERS_Y,
97-
ClusterShader::CLUSTERS_Z);
105+
bgfx::dispatch(vClusterBuilding,
106+
clusterBuildingComputeProgram,
107+
ClusterShader::CLUSTERS_X / ClusterShader::CLUSTERS_X_THREADS,
108+
ClusterShader::CLUSTERS_Y / ClusterShader::CLUSTERS_Y_THREADS,
109+
ClusterShader::CLUSTERS_Z / ClusterShader::CLUSTERS_Z_THREADS);
110+
}
98111

99112
// light culling
100113

101114
lights.bindLights(scene);
102115
clusters.bindBuffers(false); // write access, all buffers
103116

104117
bgfx::dispatch(vLightCulling,
105-
lightCullingComputeProgram,
106-
ClusterShader::CLUSTERS_X / ClusterShader::CLUSTERS_X_THREADS,
107-
ClusterShader::CLUSTERS_Y / ClusterShader::CLUSTERS_Y_THREADS,
108-
ClusterShader::CLUSTERS_Z / ClusterShader::CLUSTERS_Z_THREADS);
118+
lightCullingComputeProgram,
119+
ClusterShader::CLUSTERS_X / ClusterShader::CLUSTERS_X_THREADS,
120+
ClusterShader::CLUSTERS_Y / ClusterShader::CLUSTERS_Y_THREADS,
121+
ClusterShader::CLUSTERS_Z / ClusterShader::CLUSTERS_Z_THREADS);
109122
// lighting
110123

111124
bool debugVis = variables["DEBUG_VIS"] == "true";

src/Renderer/ClusteredRenderer.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ class ClusteredRenderer : public Renderer
1515
virtual void onShutdown() override;
1616

1717
private:
18+
glm::mat4 oldProjMat;
19+
1820
bgfx::ProgramHandle clusterBuildingComputeProgram;
1921
bgfx::ProgramHandle lightCullingComputeProgram;
2022
bgfx::ProgramHandle lightingProgram;

src/Renderer/Shaders/clusters.sh

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#define CLUSTERS_Z 24
1515

1616
// workgroup size of the culling compute shader
17+
// D3D compute shaders only allow up to 1024 threads per workgroup
1718
#define CLUSTERS_X_THREADS 16
1819
#define CLUSTERS_Y_THREADS 8
1920
#define CLUSTERS_Z_THREADS 4
@@ -37,13 +38,12 @@ uniform vec4 u_zNearFarVec;
3738
// light indices belonging to clusters
3839
CLUSTER_BUFFER(b_clusterLightIndices, uint, SAMPLER_CLUSTERS_LIGHTINDICES);
3940
// for each cluster: (start index in b_clusterLightIndices, number of point lights, empty, empty)
41+
// uint because uvec4 doesn't seem to work with D3D11
4042
CLUSTER_BUFFER(b_clusterLightGrid, uint, SAMPLER_CLUSTERS_LIGHTGRID);
41-
// uvec4 doesn't seem to work with DX11
42-
//CLUSTER_BUFFER(b_clusterLightGrid, uvec4, SAMPLER_CLUSTERS_LIGHTGRID);
4343
4444
// these are only needed for building clusters and light culling, not in the fragment shader
4545
#ifdef WRITE_CLUSTERS
46-
// list of clusters (2 vec4's each, min + max pos)
46+
// list of clusters (2 vec4's each, min + max pos for AABB)
4747
CLUSTER_BUFFER(b_clusters, vec4, SAMPLER_CLUSTERS_CLUSTERS);
4848
// atomic counter for building the light grid
4949
// must be reset to 0 every frame
@@ -60,6 +60,8 @@ struct LightGrid
6060
{
6161
uint offset;
6262
uint pointLights;
63+
// TODO
64+
//uint spotLights;
6365
};
6466

6567
#ifdef WRITE_CLUSTERS
@@ -74,7 +76,6 @@ Cluster getCluster(uint index)
7476

7577
LightGrid getLightGrid(uint cluster)
7678
{
77-
//uvec4 gridvec = b_clusterLightGrid[cluster];
7879
uvec4 gridvec = uvec4(b_clusterLightGrid[4 * cluster + 0], b_clusterLightGrid[4 * cluster + 1], 0, 0);
7980
LightGrid grid;
8081
grid.offset = gridvec.x;
@@ -90,6 +91,8 @@ uint getGridLightIndex(uint start, uint offset)
9091
// cluster depth index from depth in screen coordinates (gl_FragCoord.z)
9192
uint getClusterZIndex(float screenDepth)
9293
{
94+
// this can be calculated on the CPU and passed as a uniform
95+
// only leaving it here to keep most of the relevant code in the shaders for learning purposes
9396
float scale = float(CLUSTERS_Z) / log(u_zFar / u_zNear);
9497
float bias = -(float(CLUSTERS_Z) * log(u_zNear) / log(u_zFar / u_zNear));
9598

src/Renderer/Shaders/cs_clustered_clusterbuilding.sc

Lines changed: 12 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -10,30 +10,29 @@
1010
// z-subdivision concept from http://advances.realtimerendering.com/s2016/Siggraph2016_idTech6.pdf
1111

1212
// bgfx doesn't define this in shaders
13-
#define gl_WorkGroupSize uvec3(1, 1, 1)
14-
#define gl_NumWorkGroups uvec3(CLUSTERS_X, CLUSTERS_Y, CLUSTERS_Z)
13+
#define gl_WorkGroupSize uvec3(CLUSTERS_X_THREADS, CLUSTERS_Y_THREADS, CLUSTERS_Z_THREADS)
1514

1615
// each thread handles one cluster
17-
// TODO use workgroups that fill GPU wavefronts (32 on Nvidia, 64 on AMD)
18-
NUM_THREADS(1, 1, 1)
16+
NUM_THREADS(CLUSTERS_X_THREADS, CLUSTERS_Y_THREADS, CLUSTERS_Z_THREADS)
1917
void main()
2018
{
21-
const uint clusterIndex = gl_WorkGroupID.z * gl_NumWorkGroups.x * gl_NumWorkGroups.y +
22-
gl_WorkGroupID.y * gl_NumWorkGroups.x +
23-
gl_WorkGroupID.x;
19+
// index calculation must match the inverse operation in the fragment shader (see getClusterIndex)
20+
uint clusterIndex = gl_GlobalInvocationID.z * gl_WorkGroupSize.x * gl_WorkGroupSize.y +
21+
gl_GlobalInvocationID.y * gl_WorkGroupSize.x +
22+
gl_GlobalInvocationID.x;
2423

2524
// calculate min (bottom left) and max (top right) xy in screen coordinates
26-
vec4 minScreen = vec4(gl_WorkGroupID.xy * u_clusterSizes.xy, 1.0, 1.0);
27-
vec4 maxScreen = vec4(vec2(gl_WorkGroupID.x + 1, gl_WorkGroupID.y + 1) * u_clusterSizes.xy, 1.0, 1.0);
25+
vec4 minScreen = vec4( gl_GlobalInvocationID.xy * u_clusterSizes.xy, 1.0, 1.0);
26+
vec4 maxScreen = vec4((gl_GlobalInvocationID.xy + vec2(1, 1)) * u_clusterSizes.xy, 1.0, 1.0);
2827

2928
// -> eye coordinates
3029
// z is the camera far plane (1 in screen coordinates)
3130
vec3 minEye = screen2Eye(minScreen).xyz;
3231
vec3 maxEye = screen2Eye(maxScreen).xyz;
3332

3433
// calculate near and far depth edges of the cluster
35-
float clusterNear = u_zNear * pow(u_zFar / u_zNear, gl_WorkGroupID.z / float(gl_NumWorkGroups.z));
36-
float clusterFar = u_zNear * pow(u_zFar / u_zNear, (gl_WorkGroupID.z + 1) / float(gl_NumWorkGroups.z));
34+
float clusterNear = u_zNear * pow(u_zFar / u_zNear, gl_GlobalInvocationID.z / float(CLUSTERS_Z));
35+
float clusterFar = u_zNear * pow(u_zFar / u_zNear, (gl_GlobalInvocationID.z + 1) / float(CLUSTERS_Z));
3736

3837
// this calculates the intersection between:
3938
// - a line from the camera (origin) to the eye point (at the camera's far plane)
@@ -44,17 +43,11 @@ void main()
4443
vec3 maxNear = maxEye * clusterNear / maxEye.z;
4544
vec3 maxFar = maxEye * clusterFar / maxEye.z;
4645

47-
// get max extent of the cluster in all dimensions (axis-aligned bounding box)
46+
// get extent of the cluster in all dimensions (axis-aligned bounding box)
47+
// there is some overlap here, but it's easier to calculate intersections with an AABB
4848
vec3 minBounds = min(min(minNear, minFar), min(maxNear, maxFar));
4949
vec3 maxBounds = max(max(minNear, minFar), max(maxNear, maxFar));
5050

5151
b_clusters[2 * clusterIndex + 0] = vec4(minBounds, 1.0);
5252
b_clusters[2 * clusterIndex + 1] = vec4(maxBounds, 1.0);
53-
54-
// reset the atomic counter for the light culling shader
55-
// writable compute buffers can't be updated by CPU so do it here
56-
if(clusterIndex == 0)
57-
{
58-
b_globalIndex[0] = 0;
59-
}
6053
}

src/Renderer/Shaders/cs_clustered_lightculling.sc

Lines changed: 18 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -10,20 +10,15 @@
1010
// largely inspired by http://www.aortiz.me/2018/12/21/CG.html
1111

1212
// point lights only for now
13+
bool pointLightIntersectsCluster(PointLight light, Cluster cluster);
1314

14-
bool pointLightAffectsCluster(PointLight light, Cluster cluster);
15-
float distsqToCluster(vec3 pos, Cluster cluster);
16-
17-
// bgfx doesn't define this in shaders
1815
#define gl_WorkGroupSize uvec3(CLUSTERS_X_THREADS, CLUSTERS_Y_THREADS, CLUSTERS_Z_THREADS)
1916
#define GROUP_SIZE (CLUSTERS_X_THREADS * CLUSTERS_Y_THREADS * CLUSTERS_Z_THREADS)
2017

2118
// light cache for the current work group
2219
SHARED PointLight lights[GROUP_SIZE];
2320

24-
// work group size
2521
// each thread handles one cluster
26-
// D3D compute shaders only seem to allow 1024 threads
2722
NUM_THREADS(CLUSTERS_X_THREADS, CLUSTERS_Y_THREADS, CLUSTERS_Z_THREADS)
2823
void main()
2924
{
@@ -32,11 +27,19 @@ void main()
3227
uint visibleLights[MAX_LIGHTS_PER_CLUSTER];
3328
uint visibleCount = 0;
3429

35-
// the way we calculate the index doesn't really matter here since we write to the same index as we read from the cluster buffer
36-
// it only matters that the cluster building and fragment shader calculate the cluster index the same way
30+
// the way we calculate the index doesn't really matter here since we write to the same index in the light grid as we read from the cluster buffer
3731
uint clusterIndex = gl_GlobalInvocationID.z * gl_WorkGroupSize.x * gl_WorkGroupSize.y +
3832
gl_GlobalInvocationID.y * gl_WorkGroupSize.x +
3933
gl_GlobalInvocationID.x;
34+
35+
// reset the atomic counter
36+
// writable compute buffers can't be updated by CPU so do it here
37+
if(clusterIndex == 0)
38+
{
39+
b_globalIndex[0] = 0;
40+
}
41+
42+
Cluster cluster = getCluster(clusterIndex);
4043

4144
// we have a cache of GROUP_SIZE lights
4245
// have to run this loop several times if we have more than GROUP_SIZE lights
@@ -89,34 +92,30 @@ void main()
8992
}
9093

9194
// write light grid for this cluster
92-
//b_clusterLightGrid[clusterIndex] = uvec4(offset, visibleCount, 0, 0);
9395
b_clusterLightGrid[4 * clusterIndex + 0] = offset;
9496
b_clusterLightGrid[4 * clusterIndex + 1] = visibleCount;
95-
//b_clusterLightGrid[4 * clusterIndex + 2] = 0; // unused, spot lights etc.
97+
// unused, spot lights etc.
98+
//b_clusterLightGrid[4 * clusterIndex + 2] = 0;
9699
//b_clusterLightGrid[4 * clusterIndex + 3] = 0;
97100
}
98101

99102
// check if light radius extends into the cluster
100-
bool pointLightAffectsCluster(PointLight light, Cluster cluster)
103+
bool pointLightIntersectsCluster(PointLight light, Cluster cluster)
101104
{
102105
// NOTE: expects light.position to be in view space like the cluster bounds
103106
// global light list has world space coordinates, but we transform the
104107
// coordinates in the shared array of lights after copying
105-
return distsqToCluster(light.position, cluster) <= (light.radius * light.radius);
106-
}
107108

108-
// squared distance of the point to planes of the bounding box
109-
float distsqToCluster(vec3 pos, Cluster cluster)
110-
{
111109
// only add the distance along a dimension if the position is outside the bounding box in that dimension
112110

113-
vec3 belowDist = cluster.minBounds - pos;
114-
vec3 aboveDist = pos - cluster.maxBounds;
111+
vec3 belowDist = cluster.minBounds - light.position;
112+
vec3 aboveDist = light.position - cluster.maxBounds;
115113

116114
vec3 isBelow = vec3(greaterThan(belowDist, vec3_splat(0.0)));
117115
vec3 isAbove = vec3(greaterThan(aboveDist, vec3_splat(0.0)));
118116

119117
vec3 distSqVec = (isBelow * belowDist) + (isAbove * aboveDist);
120118
float distsq = dot(distSqVec, distSqVec);
121-
return distsq;
119+
120+
return distsq <= (light.radius * light.radius);
122121
}

0 commit comments

Comments
 (0)