Skip to content

Commit 7dc35b1

Browse files
Fix a tiny bug affecting workgroup scans when ItemCount != WorkgroupSize
1 parent 4f257ba commit 7dc35b1

File tree

2 files changed

+8
-7
lines changed

2 files changed

+8
-7
lines changed

include/nbl/builtin/hlsl/workgroup/shared_scan.hlsl

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,13 @@ struct reduce
2929
const uint16_t lastInvocation = ItemCount-1;
3030
const uint16_t subgroupMask = uint16_t(glsl::gl_SubgroupSize()-1u);
3131

32-
lastInvocationInLevel = lastInvocation;
33-
3432
subgroup::inclusive_scan<BinOp> subgroupOp;
35-
firstLevelScan = subgroupOp(value);
33+
34+
lastInvocationInLevel = lastInvocation;
35+
scanLoadIndex = SubgroupContiguousIndex();
36+
participate = scanLoadIndex<=lastInvocationInLevel;
37+
38+
firstLevelScan = subgroupOp(participate ? value:BinOp::identity);
3639
type_t scan = firstLevelScan;
3740

3841
// could use ElectLast() but we can optimize for full workgroups here
@@ -44,10 +47,8 @@ struct reduce
4447
// Consequently, those first gl_SubgroupSz^2 invocations will store their results on gl_SubgroupSz scratch slots
4548
// and the next level will follow the same + the previous as an `offset`.
4649

47-
scanLoadIndex = SubgroupContiguousIndex();
4850
const uint16_t loadStoreIndexDiff = scanLoadIndex-uint16_t(glsl::gl_SubgroupID());
4951

50-
participate = scanLoadIndex<=lastInvocationInLevel;
5152
// to cancel out the index shift on the first iteration
5253
if (lastInvocationInLevel>subgroupMask)
5354
scanLoadIndex -= lastInvocationInLevel-1;
@@ -58,7 +59,7 @@ struct reduce
5859
scanLoadIndex += lastInvocationInLevel+1;
5960
// only invocations that have the final value of the subgroupOp (inclusive scan) store their results
6061
if (participate && (SubgroupContiguousIndex()==lastInvocationInLevel || isLastSubgroupInvocation))
61-
scratchAccessor.set(scanLoadIndex-loadStoreIndexDiff, scan); // For subgroupSz = 32, first 512 invocations store index is [0,15], 512-1023 [16,31] etc.
62+
scratchAccessor.set(scanLoadIndex-loadStoreIndexDiff,scan); // For subgroupSz = 32, first 512 invocations store index is [0,15], 512-1023 [16,31] etc.
6263
scratchAccessor.workgroupExecutionAndMemoryBarrier();
6364
participate = SubgroupContiguousIndex() <= (lastInvocationInLevel >>= glsl::gl_SubgroupSizeLog2());
6465
if(participate)

0 commit comments

Comments
 (0)