@@ -17,16 +17,59 @@ namespace nbl::video
17
17
#include " nbl/builtin/glsl/scan/default_scheduler.glsl"
18
18
static_assert (NBL_BUILTIN_MAX_SCAN_LEVELS&0x1 ," NBL_BUILTIN_MAX_SCAN_LEVELS must be odd!" );
19
19
20
- //
20
+ /* *
21
+ Utility class to help you perform the equivalent of `std::inclusive_scan` and `std::exclusive_scan` with data on the GPU.
22
+
23
+ The basic building block is a Blelloch-Scan, the `nbl_glsl_workgroup{Add/Mul/And/Xor/Or/Min/Max}{Exclusive/Inclusive}`:
24
+ https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda
25
+ https://classes.engineering.wustl.edu/cse231/core/index.php/Scan
26
+
27
+ The workgroup scan is itself probably built out of Hillis-Steele subgroup scans; we use `KHR_shader_subgroup_arithmetic` whenever available,
28
+ but fall back to our own "software" emulation of subgroup arithmetic using Hillis-Steele and some scratch shared memory.
29
+
30
+ The way the workgroup scans are combined is amongst the most advanced in its class, because it performs the scan as a single dispatch
31
+ via some clever scheduling which allows it to also be used in "indirect mode", which is when you don't know the number of elements
32
+ that you'll be scanning on the CPU side. This is why it provides two flavours of the compute shader.
33
+
34
+ The scheduling relies on two principles:
35
+ - Virtual and Persistent Workgroups
36
+ - Atomic Counters as Semaphores
37
+
38
+ # Virtual Workgroups
39
+ TODO: Move this Paragraph somewhere else.
40
+ Generally speaking, launching a new workgroup has non-trivial overhead.
41
+
42
+ Also, most IHVs (especially AMD) have silly limits on the ranges of dispatches (like 64k workgroups), which also apply to 1D dispatches.
43
+
44
+ It becomes impossible to keep a simple 1 invocation to 1 data element relationship when processing a large buffer without reusing workgroups.
45
+
46
+ Virtual Persistent Workgroups is a $20 term for a $0.25 idea; it's simply to do the following:
47
+ 1. Launch a 1D dispatch "just big enough" to saturate your GPU, `SPhysicalDeviceLimits::maxResidentInvocations` helps you figure out this number
48
+ 2. Make a single workgroup perform the task of multiple workgroups by repeating itself
49
+ 3. Instead of relying on `gl_WorkGroupID` or `gl_GlobalInvocationID` to find your work items, use your own ID unique for the virtual workgroup
50
+
51
+ This usually has the form of
52
+ ```glsl
53
+ for (uint virtualWorkgroupIndex=gl_WorkGroupID.x; virtualWorkgroupIndex<virtualWorkgroupCount; virtualWorkgroupIndex+=gl_NumWorkGroups.x)
54
+ {
55
+ // do actual work for a single workgroup
56
+ }
57
+ ```
58
+ **/
21
59
class NBL_API CScanner final : public core::IReferenceCounted
22
60
{
23
61
public:
24
62
enum E_SCAN_TYPE : uint8_t
25
63
{
64
+ // computes output[n] = Sum_{i<=n}(input[i])
26
65
EST_INCLUSIVE = _NBL_GLSL_SCAN_TYPE_INCLUSIVE_,
66
+ // computes output[n] = Sum_{i<n}(input[i]), meaning first element is identity
27
67
EST_EXCLUSIVE = _NBL_GLSL_SCAN_TYPE_EXCLUSIVE_,
28
68
EST_COUNT
29
69
};
70
+ // Only 4 byte wide data types are supported due to the need to trade them via shared memory,
71
+ // different combinations of data type and operator have different identity elements.
72
+ // `EDT_INT` and `EO_MIN` will have `INT_MAX` as identity, while `EDT_UINT` would have `UINT_MAX`
30
73
enum E_DATA_TYPE : uint8_t
31
74
{
32
75
EDT_UINT=0u ,
@@ -266,4 +309,4 @@ class NBL_API CScanner final : public core::IReferenceCounted
266
309
267
310
}
268
311
269
- #endif
312
+ #endif
0 commit comments