@@ -29,10 +29,13 @@ struct reduce
29
29
const uint16_t lastInvocation = ItemCount-1 ;
30
30
const uint16_t subgroupMask = uint16_t (glsl::gl_SubgroupSize ()-1u);
31
31
32
- lastInvocationInLevel = lastInvocation;
33
-
34
32
subgroup::inclusive_scan<BinOp> subgroupOp;
35
- firstLevelScan = subgroupOp (value);
33
+
34
+ lastInvocationInLevel = lastInvocation;
35
+ scanLoadIndex = SubgroupContiguousIndex ();
36
+ participate = scanLoadIndex<=lastInvocationInLevel;
37
+
38
+ firstLevelScan = subgroupOp (participate ? value:BinOp::identity);
36
39
type_t scan = firstLevelScan;
37
40
38
41
// could use ElectLast() but we can optimize for full workgroups here
@@ -44,10 +47,8 @@ struct reduce
44
47
// Consequently, those first gl_SubgroupSz^2 invocations will store their results on gl_SubgroupSz scratch slots
45
48
// and the next level will follow the same scheme, with the previous level's slots as an `offset`.
46
49
47
- scanLoadIndex = SubgroupContiguousIndex ();
48
50
const uint16_t loadStoreIndexDiff = scanLoadIndex-uint16_t (glsl::gl_SubgroupID ());
49
51
50
- participate = scanLoadIndex<=lastInvocationInLevel;
51
52
// to cancel out the index shift on the first iteration
52
53
if (lastInvocationInLevel>subgroupMask)
53
54
scanLoadIndex -= lastInvocationInLevel-1 ;
@@ -58,7 +59,7 @@ struct reduce
58
59
scanLoadIndex += lastInvocationInLevel+1 ;
59
60
// only invocations that have the final value of the subgroupOp (inclusive scan) store their results
60
61
if (participate && (SubgroupContiguousIndex ()==lastInvocationInLevel || isLastSubgroupInvocation))
61
- scratchAccessor.set (scanLoadIndex-loadStoreIndexDiff, scan); // For subgroupSz = 32, first 512 invocations store index is [0,15], 512-1023 [16,31] etc.
62
+ scratchAccessor.set (scanLoadIndex-loadStoreIndexDiff,scan); // For subgroupSz = 32, first 512 invocations store index is [0,15], 512-1023 [16,31] etc.
62
63
scratchAccessor.workgroupExecutionAndMemoryBarrier ();
63
64
participate = SubgroupContiguousIndex () <= (lastInvocationInLevel >>= glsl::gl_SubgroupSizeLog2 ());
64
65
if (participate)
0 commit comments