Commit cf4fdf9

Remove shader variants (#1232)
Parent: da8ee80

File tree: 3 files changed, +172 −69 lines changed


samples/performance/16bit_arithmetic/16bit_arithmetic.cpp

Lines changed: 11 additions & 7 deletions
@@ -156,14 +156,18 @@ bool KHR16BitArithmeticSample::prepare(const vkb::ApplicationOptions &options)
        vkb::ShaderVariant variant;
        if (supports_push_constant16)
        {
-            variant.add_define("PUSH_CONSTANT_16");
+            auto &module_fp16 =
+                device.get_resource_cache().request_shader_module(VK_SHADER_STAGE_COMPUTE_BIT,
+                                                                  vkb::ShaderSource{"16bit_arithmetic/compute_buffer_fp16.comp"}, variant);
+            compute_layout_fp16 = &device.get_resource_cache().request_pipeline_layout({&module_fp16});
+        }
+        else
+        {
+            auto &module_fp16 =
+                device.get_resource_cache().request_shader_module(VK_SHADER_STAGE_COMPUTE_BIT,
+                                                                  vkb::ShaderSource{"16bit_arithmetic/compute_buffer_fp16_fallback.comp"}, variant);
+            compute_layout_fp16 = &device.get_resource_cache().request_pipeline_layout({&module_fp16});
        }
-
-        const char *shader = "16bit_arithmetic/compute_buffer_fp16.comp";
-        auto &module_fp16 =
-            device.get_resource_cache().request_shader_module(VK_SHADER_STAGE_COMPUTE_BIT,
-                                                              vkb::ShaderSource{shader}, variant);
-        compute_layout_fp16 = &device.get_resource_cache().request_pipeline_layout({&module_fp16});
    }
    else
    {
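
For context: the supports_push_constant16 flag that selects between the two shader files above is driven by the storagePushConstant16 feature bit; the feature query itself is outside this diff. Below is a minimal, hypothetical sketch of such a query against the core Vulkan 1.1 feature struct (the function name query_push_constant16 is illustrative, not taken from the sample):

// Hypothetical sketch, not from this commit: deriving a supports_push_constant16
// flag from the 16-bit storage feature query (Vulkan 1.1 core).
#include <vulkan/vulkan.h>

bool query_push_constant16(VkPhysicalDevice gpu)
{
    VkPhysicalDevice16BitStorageFeatures storage_16bit{};
    storage_16bit.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES;

    VkPhysicalDeviceFeatures2 features2{};
    features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
    features2.pNext = &storage_16bit;

    // Query which 16-bit storage features the implementation exposes.
    vkGetPhysicalDeviceFeatures2(gpu, &features2);

    // storagePushConstant16 gates the 16-bit push-constant layout used by
    // compute_buffer_fp16.comp; without it the fallback shader is needed.
    return storage_16bit.storagePushConstant16 == VK_TRUE;
}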
16bit_arithmetic/compute_buffer_fp16.comp

Lines changed: 55 additions & 62 deletions
@@ -1,5 +1,5 @@
 #version 450
-/* Copyright (c) 2020-2021, Arm Limited and Contributors
+/* Copyright (c) 2020-2024, Arm Limited and Contributors
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -24,89 +24,82 @@

 layout(local_size_x = 8, local_size_y = 8) in;

-layout(constant_id = 0) const uint WIDTH = 1;
+layout(constant_id = 0) const uint WIDTH = 1;
 layout(constant_id = 1) const uint HEIGHT = 1;

 layout(set = 0, binding = 0) readonly buffer SSBO
 {
-    // It is possible to use native 16-bit types in SSBOs and UBOs. We could use uvec2 here and unpack manually.
-    // The key feature of 16-bit storage is to allow scalar access to 16-bit values however.
-    // Avoiding extra unpacking and packing can also be useful.
-    f16vec4 blob_data[];
+    // It is possible to use native 16-bit types in SSBOs and UBOs. We could use uvec2 here and unpack manually.
+    // The key feature of 16-bit storage is to allow scalar access to 16-bit values however.
+    // Avoiding extra unpacking and packing can also be useful.
+    f16vec4 blob_data[];
 };

 layout(rgba16f, set = 0, binding = 1) writeonly uniform mediump image2D o_results;

 layout(push_constant) uniform Registers
 {
-    // Push constants can also be 16-bit. This can also be very useful since push constant space is so limited!
-#ifdef PUSH_CONSTANT_16
-    uint16_t num_blobs;
-    float16_t seed;
-    i16vec2 range;
-#else
-    // Fallback for implementations which do not support PushConstant16.
-    uint num_blobs;
-    float seed;
-    ivec2 range;
-#endif
-} registers;
+    uint16_t num_blobs;
+    float16_t seed;
+    i16vec2 range;
+}
+registers;

 // This is very arbitrary. Expends a ton of arithmetic to compute
 // something that looks similar to a lens flare.
 f16vec4 compute_blob(f16vec2 pos, f16vec4 blob, float16_t seed)
 {
-    f16vec2 offset = pos - blob.xy;
-    f16vec4 rg_offset = offset.xxyy * f16vec4(0.95hf, 1.0hf, 0.95hf, 1.0hf);
-    f16vec4 bs_offset = offset.xxyy * f16vec4(1.05hf, 1.1hf + seed, 1.05hf, 1.1hf + seed);
-
-    f16vec4 rg_dot = rg_offset * rg_offset;
-    f16vec4 bs_dot = bs_offset * bs_offset;
-
-    // Dot products can be somewhat awkward in FP16, since the result is a scalar 16-bit value, and we don't want that.
-    // To that end, we compute at least two dot products side by side, and rg_offset and bs_offset are swizzled
-    // such that we avoid swizzling across a 32-bit boundary.
-    f16vec4 dots = f16vec4(rg_dot.xy + rg_dot.zw, bs_dot.xy + bs_dot.zw) * blob.w;
-
-    // Now we have square distances to blob center.
-
-    // Gotta have some FMAs, right? :D
-    dots = dots * dots + dots;
-    dots = dots * dots + dots;
-    dots = dots * dots + dots;
-    dots = dots * dots + dots;
-    dots = dots * dots + dots;
-    dots = dots * dots + dots;
-
-    f16vec4 parabolas = max(f16vec4(1.0hf, 1.0hf, 1.0hf, 0.9hf) - dots, f16vec4(0.0hf));
-    parabolas -= parabolas.w;
-    parabolas = max(parabolas, f16vec4(0.0hf));
-    return parabolas;
+    f16vec2 offset = pos - blob.xy;
+    f16vec4 rg_offset = offset.xxyy * f16vec4(0.95hf, 1.0hf, 0.95hf, 1.0hf);
+    f16vec4 bs_offset = offset.xxyy * f16vec4(1.05hf, 1.1hf + seed, 1.05hf, 1.1hf + seed);
+
+    f16vec4 rg_dot = rg_offset * rg_offset;
+    f16vec4 bs_dot = bs_offset * bs_offset;
+
+    // Dot products can be somewhat awkward in FP16, since the result is a scalar 16-bit value, and we don't want that.
+    // To that end, we compute at least two dot products side by side, and rg_offset and bs_offset are swizzled
+    // such that we avoid swizzling across a 32-bit boundary.
+    f16vec4 dots = f16vec4(rg_dot.xy + rg_dot.zw, bs_dot.xy + bs_dot.zw) * blob.w;
+
+    // Now we have square distances to blob center.
+
+    // Gotta have some FMAs, right? :D
+    dots = dots * dots + dots;
+    dots = dots * dots + dots;
+    dots = dots * dots + dots;
+    dots = dots * dots + dots;
+    dots = dots * dots + dots;
+    dots = dots * dots + dots;
+
+    f16vec4 parabolas = max(f16vec4(1.0hf, 1.0hf, 1.0hf, 0.9hf) - dots, f16vec4(0.0hf));
+    parabolas -= parabolas.w;
+    parabolas = max(parabolas, f16vec4(0.0hf));
+    return parabolas;
 }

 void main()
 {
-    uint num_blobs = uint(registers.num_blobs);
+    uint num_blobs = uint(registers.num_blobs);

-    float x = float(gl_GlobalInvocationID.x) / float(WIDTH) - 0.5;
-    float y = float(gl_GlobalInvocationID.y) / float(HEIGHT) - 0.5;
-    f16vec2 pos = f16vec2(x, y);
-    f16vec4 result = f16vec4(0.0hf);
-    float16_t seed = float16_t(registers.seed);
-    ivec2 range = ivec2(registers.range);
+    float x = float(gl_GlobalInvocationID.x) / float(WIDTH) - 0.5;
+    float y = float(gl_GlobalInvocationID.y) / float(HEIGHT) - 0.5;
+    f16vec2 pos = f16vec2(x, y);
+    f16vec4 result = f16vec4(0.0hf);
+    float16_t seed = float16_t(registers.seed);
+    ivec2 range = ivec2(registers.range);

-    const float16_t EXPAND_FACTOR = 0.3hf;
-    float16_t stride = seed * EXPAND_FACTOR;
+    const float16_t EXPAND_FACTOR = 0.3hf;
+    float16_t stride = seed * EXPAND_FACTOR;

-    for (uint i = 0; i < num_blobs; i++)
-    {
-        f16vec4 blob = blob_data[i];
+    for (uint i = 0; i < num_blobs; i++)
+    {
+        f16vec4 blob = blob_data[i];

-        // Get as much mileage out of the buffer load as possible.
-        for (int y = -range.y; y <= range.y; y++)
-            for (int x = -range.x; x <= range.x; x++)
-                result += compute_blob(pos + stride * f16vec2(x, y), blob, seed);
-    }
+        // Get as much mileage out of the buffer load as possible.
+        for (int y = -range.y; y <= range.y; y++)
+            for (int x = -range.x; x <= range.x; x++)
+                result += compute_blob(pos + stride * f16vec2(x, y), blob, seed);
+    }

-    imageStore(o_results, ivec2(gl_GlobalInvocationID.xy), result);
+    imageStore(o_results, ivec2(gl_GlobalInvocationID.xy), result);
 }
16bit_arithmetic/compute_buffer_fp16_fallback.comp (new file)

Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@
+#version 450
+/* Copyright (c) 2020-2024, Arm Limited and Contributors
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 the "License";
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Allows us to use float16_t for arithmetic purposes.
+#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
+
+// Allows us to use int16_t, uint16_t and float16_t for buffers.
+#extension GL_EXT_shader_16bit_storage : require
+
+layout(local_size_x = 8, local_size_y = 8) in;
+
+layout(constant_id = 0) const uint WIDTH = 1;
+layout(constant_id = 1) const uint HEIGHT = 1;
+
+layout(set = 0, binding = 0) readonly buffer SSBO
+{
+    // It is possible to use native 16-bit types in SSBOs and UBOs. We could use uvec2 here and unpack manually.
+    // The key feature of 16-bit storage is to allow scalar access to 16-bit values however.
+    // Avoiding extra unpacking and packing can also be useful.
+    f16vec4 blob_data[];
+};
+
+layout(rgba16f, set = 0, binding = 1) writeonly uniform mediump image2D o_results;
+
+layout(push_constant) uniform Registers
+{
+    // Fallback for implementations which do not support PushConstant16.
+    uint num_blobs;
+    float seed;
+    ivec2 range;
+}
+registers;
+
+// This is very arbitrary. Expends a ton of arithmetic to compute
+// something that looks similar to a lens flare.
+f16vec4 compute_blob(f16vec2 pos, f16vec4 blob, float16_t seed)
+{
+    f16vec2 offset = pos - blob.xy;
+    f16vec4 rg_offset = offset.xxyy * f16vec4(0.95hf, 1.0hf, 0.95hf, 1.0hf);
+    f16vec4 bs_offset = offset.xxyy * f16vec4(1.05hf, 1.1hf + seed, 1.05hf, 1.1hf + seed);
+
+    f16vec4 rg_dot = rg_offset * rg_offset;
+    f16vec4 bs_dot = bs_offset * bs_offset;
+
+    // Dot products can be somewhat awkward in FP16, since the result is a scalar 16-bit value, and we don't want that.
+    // To that end, we compute at least two dot products side by side, and rg_offset and bs_offset are swizzled
+    // such that we avoid swizzling across a 32-bit boundary.
+    f16vec4 dots = f16vec4(rg_dot.xy + rg_dot.zw, bs_dot.xy + bs_dot.zw) * blob.w;
+
+    // Now we have square distances to blob center.
+
+    // Gotta have some FMAs, right? :D
+    dots = dots * dots + dots;
+    dots = dots * dots + dots;
+    dots = dots * dots + dots;
+    dots = dots * dots + dots;
+    dots = dots * dots + dots;
+    dots = dots * dots + dots;
+
+    f16vec4 parabolas = max(f16vec4(1.0hf, 1.0hf, 1.0hf, 0.9hf) - dots, f16vec4(0.0hf));
+    parabolas -= parabolas.w;
+    parabolas = max(parabolas, f16vec4(0.0hf));
+    return parabolas;
+}
+
+void main()
+{
+    uint num_blobs = uint(registers.num_blobs);
+
+    float x = float(gl_GlobalInvocationID.x) / float(WIDTH) - 0.5;
+    float y = float(gl_GlobalInvocationID.y) / float(HEIGHT) - 0.5;
+    f16vec2 pos = f16vec2(x, y);
+    f16vec4 result = f16vec4(0.0hf);
+    float16_t seed = float16_t(registers.seed);
+    ivec2 range = ivec2(registers.range);
+
+    const float16_t EXPAND_FACTOR = 0.3hf;
+    float16_t stride = seed * EXPAND_FACTOR;
+
+    for (uint i = 0; i < num_blobs; i++)
+    {
+        f16vec4 blob = blob_data[i];
+
+        // Get as much mileage out of the buffer load as possible.
+        for (int y = -range.y; y <= range.y; y++)
+            for (int x = -range.x; x <= range.x; x++)
+                result += compute_blob(pos + stride * f16vec2(x, y), blob, seed);
+    }
+
+    imageStore(o_results, ivec2(gl_GlobalInvocationID.xy), result);
+}
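
Both shaders scale their work against the WIDTH and HEIGHT specialization constants (constant_id 0 and 1). How those constants are supplied at pipeline-creation time is framework code outside this commit; a minimal sketch using the plain Vulkan specialization-constant mechanism (the helper name make_spec_info is illustrative):

// Hypothetical sketch, not from this commit: feeding WIDTH/HEIGHT
// (constant_id 0 and 1) through VkSpecializationInfo at pipeline creation.
#include <vulkan/vulkan.h>
#include <cstdint>

VkSpecializationInfo make_spec_info(const uint32_t (&dims)[2])
{
    // Map constant_id 0 -> dims[0] (WIDTH) and constant_id 1 -> dims[1] (HEIGHT).
    static const VkSpecializationMapEntry entries[2] = {
        {0, 0, sizeof(uint32_t)},
        {1, sizeof(uint32_t), sizeof(uint32_t)},
    };

    VkSpecializationInfo info{};
    info.mapEntryCount = 2;
    info.pMapEntries   = entries;
    info.dataSize      = sizeof(uint32_t) * 2;
    info.pData         = dims;        // caller must keep dims alive until pipeline creation
    return info;
}

// The returned struct is then referenced from
// VkPipelineShaderStageCreateInfo::pSpecializationInfo for the compute stage.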
