|
1 | 1 | #version 450 |
2 | | -/* Copyright (c) 2020-2021, Arm Limited and Contributors |
| 2 | +/* Copyright (c) 2020-2024, Arm Limited and Contributors |
3 | 3 | * |
4 | 4 | * SPDX-License-Identifier: Apache-2.0 |
5 | 5 | * |
|
24 | 24 |
|
// Each workgroup is an 8x8 grid of invocations.
layout(local_size_x = 8, local_size_y = 8) in;

// Specialization constants. main() divides the global invocation ID by these,
// per axis, before shifting by -0.5 to obtain the sample position.
layout(constant_id = 0) const uint WIDTH  = 1;
layout(constant_id = 1) const uint HEIGHT = 1;
29 | 29 |
|
// Read-only blob table; main() reads one element per outer-loop iteration.
layout(set = 0, binding = 0) readonly buffer SSBO
{
    // Native 16-bit types may be used directly in SSBOs and UBOs. A uvec2 with
    // manual unpacking would also work, but the main point of 16-bit storage is
    // scalar access to 16-bit values, and skipping the pack/unpack step can be
    // a benefit in its own right.
    f16vec4 blob_data[];
};
37 | 37 |
|
// Write-only rgba16f output image; main() stores one texel per invocation.
layout(rgba16f, set = 0, binding = 1) writeonly uniform mediump image2D o_results;
39 | 39 |
|
// Per-dispatch parameters. Every member is a 16-bit type: push constants can be
// 16-bit too, which helps because push constant space is so limited.
layout(push_constant) uniform Registers
{
    // Number of blob_data entries main() iterates over.
    uint16_t num_blobs;
    // Forwarded to compute_blob() and also scaled into the sample stride.
    float16_t seed;
    // main() loops offsets from -range to +range (inclusive) on each axis.
    i16vec2 range;
} registers;
54 | 47 |
|
// Evaluates one blob at one sample position. The math is deliberately
// arbitrary: it burns a lot of arithmetic to produce something that looks
// similar to a lens flare.
//
// pos  - sample position.
// blob - xy is the blob center; w scales the squared distances.
// seed - added to the 1.1 scale factor used by two of the four lanes.
//
// Returns four lanes clamped to be non-negative. The last lane of the
// intermediate falloff is subtracted from every lane (itself included) before
// the final clamp.
f16vec4 compute_blob(f16vec2 pos, f16vec4 blob, float16_t seed)
{
    f16vec2 delta      = pos - blob.xy;
    f16vec4 delta_xxyy = delta.xxyy;

    // Four slightly different scalings of the same offset, laid out as
    // (x, x, y, y) pairs.
    float16_t seeded_scale = 1.1hf + seed;
    f16vec4   scaled_rg    = delta_xxyy * f16vec4(0.95hf, 1.0hf, 0.95hf, 1.0hf);
    f16vec4   scaled_bs    = delta_xxyy * f16vec4(1.05hf, seeded_scale, 1.05hf, seeded_scale);

    f16vec4 sq_rg = scaled_rg * scaled_rg;
    f16vec4 sq_bs = scaled_bs * scaled_bs;

    // A dot product in FP16 yields a single 16-bit scalar, which is awkward.
    // Instead, at least two dot products are computed side by side; the xxyy
    // layout above means the .xy + .zw sums never swizzle across a 32-bit
    // boundary.
    f16vec4 dist = f16vec4(sq_rg.xy + sq_rg.zw, sq_bs.xy + sq_bs.zw) * blob.w;

    // dist now holds squared distances to the blob center.

    // Six rounds of multiply-add, purely to generate FMA work.
    for (int round = 0; round < 6; round++)
    {
        dist = dist * dist + dist;
    }

    f16vec4   falloff = max(f16vec4(1.0hf, 1.0hf, 1.0hf, 0.9hf) - dist, f16vec4(0.0hf));
    float16_t cutoff  = falloff.w;
    return max(falloff - cutoff, f16vec4(0.0hf));
}
86 | 79 |
|
// Entry point: one invocation per output texel.
//
// Sums compute_blob() over every blob in blob_data and, for each blob, over
// sample offsets from -range to +range (inclusive) on both axes, then writes
// the sum to o_results at this invocation's coordinate.
void main()
{
    // Widen the 16-bit push constants once, up front.
    uint      blob_count = uint(registers.num_blobs);
    float16_t seed       = float16_t(registers.seed);
    ivec2     range      = ivec2(registers.range);

    // Position is computed in 32-bit float and only then narrowed to FP16.
    vec2    pos32 = vec2(gl_GlobalInvocationID.xy) / vec2(WIDTH, HEIGHT) - 0.5;
    f16vec2 pos   = f16vec2(pos32);

    const float16_t EXPAND_FACTOR = 0.3hf;
    float16_t       stride        = seed * EXPAND_FACTOR;

    f16vec4 result = f16vec4(0.0hf);

    for (uint i = 0; i < blob_count; i++)
    {
        f16vec4 blob = blob_data[i];

        // Reuse each buffer load for the whole grid of offsets to get as much
        // mileage out of it as possible.
        for (int dy = -range.y; dy <= range.y; dy++)
        {
            for (int dx = -range.x; dx <= range.x; dx++)
            {
                f16vec2 tap = pos + stride * f16vec2(dx, dy);
                result += compute_blob(tap, blob, seed);
            }
        }
    }

    imageStore(o_results, ivec2(gl_GlobalInvocationID.xy), result);
}
0 commit comments