@@ -78,81 +78,83 @@ struct reduce
78
78
};
79
79
80
80
// Workgroup scan functor layered on top of `reduce<BinOp,ItemCount>`.
// `__call` first runs the reduce's `__call`, then iterates over the levels using the scratch accessor,
// and finally returns `firstLevelScan`; `Exclusive` enables the extra shuffle-up step at the very end.
// NOTE(review): this relies on `reduce::__call` leaving `participate`, `scanLoadIndex`, `lastLevelScan`,
// `lastInvocationInLevel` and `firstLevelScan` populated - `reduce` is not visible here, confirm against its definition.
template<class BinOp, bool Exclusive, uint16_t ItemCount>
81
- struct scan : reduce<BinOp,ItemCount>
81
+ struct scan// : reduce<BinOp,ItemCount> https://github.com/microsoft/DirectXShaderCompiler/issues/5966
82
82
{
83
83
using base_t = reduce<BinOp,ItemCount>;
84
// the reduce is held as a member instead of a base class (inheritance is commented out above, with a link to a DXC issue)
+ base_t __base;
84
85
using type_t = typename base_t::type_t;
85
86
86
87
// Runs the scan for the calling invocation.
// `value` - forwarded unchanged to the reduce's `__call`
// `scratchAccessor` - forwarded to the reduce, then used here through `get`, `set` and `workgroupExecutionAndMemoryBarrier`
// Returns `firstLevelScan` of the reduce member once the level loop below has run (shuffled up by one when `Exclusive`).
template<class Accessor>
87
88
type_t __call (NBL_CONST_REF_ARG (type_t) value, NBL_REF_ARG (Accessor) scratchAccessor)
88
89
{
89
- base_t:: template __call<Accessor>(value,scratchAccessor);
90
+ __base. template __call<Accessor>(value,scratchAccessor);
90
91
91
92
const uint16_t subgroupID = uint16_t (glsl::gl_SubgroupID ());
92
93
// abuse integer wraparound to map 0 to 0xffffu
93
94
const uint16_t prevSubgroupID = subgroupID-1 ;
94
95
95
96
// important check to prevent weird `firstbithigh` overflows
96
- if (base_t::lastInvocation>=uint16_t (glsl::gl_SubgroupSize ()))
97
// index of the last item, computed locally here instead of being read from the base
+ const uint16_t lastInvocation = ItemCount-1 ;
98
+ if (lastInvocation>=uint16_t (glsl::gl_SubgroupSize ()))
97
99
{
98
100
const uint16_t subgroupSizeLog2 = uint16_t (glsl::gl_SubgroupSizeLog2 ());
99
101
// different than Upsweep cause we need to translate high level inclusive scans into exclusive on the fly, so we get the value of the subgroup behind our own in each level
100
102
const uint16_t storeLoadIndexDiff = SubgroupContiguousIndex ()-prevSubgroupID;
101
103
102
104
BinOp binop;
103
105
// because DXC doesn't do references and I need my "frozen" registers
104
- #define scanStoreIndex base_t:: scanLoadIndex
106
+ #define scanStoreIndex __base. scanLoadIndex
105
107
// we loop over levels from highest to penultimate
106
108
// as we iterate some previously active (higher level) invocations hold their exclusive prefix sum in `lastLevelScan`
107
- const uint16_t temp = firstbithigh (base_t:: lastInvocation)/subgroupSizeLog2; // doing division then multiplication might be optimized away by the compiler
108
- const uint16_t initialLogShift = temp * subgroupSizeLog2;
109
// explicit casts: `firstbithigh` is fed a 32-bit value and the quotient is narrowed back to 16 bits
+ const uint16_t temp = uint16_t ( firstbithigh (uint32_t ( lastInvocation)) /subgroupSizeLog2) ; // doing division then multiplication might be optimized away by the compiler
110
+ const uint16_t initialLogShift = temp* subgroupSizeLog2;
109
111
// TODO: later [unroll(scan_levels<ItemCount,MinSubgroupSize>::value-1)]
110
112
[unroll (1 )]
111
113
for (uint16_t logShift=initialLogShift; bool (logShift); logShift-=subgroupSizeLog2)
112
114
{
113
115
// on the first iteration gl_SubgroupID==0 will participate but not afterwards because binop operand is identity
114
- if (base_t:: participate)
116
+ if (__base. participate)
115
117
{
116
118
// we need to add the higher level invocation exclusive prefix sum to current value
117
119
if (logShift!=initialLogShift) // but the top level doesn't have any level above itself
118
120
{
119
121
// this is fine if on the way up you also += under `if (participate)`
120
- scanStoreIndex -= base_t:: lastInvocationInLevel+1 ;
121
- base_t:: lastLevelScan = binop (base_t:: lastLevelScan,scratchAccessor.get (scanStoreIndex));
122
+ scanStoreIndex -= __base. lastInvocationInLevel+1 ;
123
+ __base. lastLevelScan = binop (__base. lastLevelScan,scratchAccessor.get (scanStoreIndex));
122
124
}
123
125
// now `lastLevelScan` has current level's inclusive prefix sum computed properly
124
126
// note we're overwriting the same location with same invocation so no barrier needed
125
127
// we store everything even though we'll never use the last entry due to shuffleup on read
126
- scratchAccessor.set (scanStoreIndex,base_t:: lastLevelScan);
128
+ scratchAccessor.set (scanStoreIndex,__base. lastLevelScan);
127
129
}
128
130
scratchAccessor.workgroupExecutionAndMemoryBarrier ();
129
131
// we're sneaky and exclude `gl_SubgroupID==0` from participation by abusing integer underflow
130
- base_t:: participate = prevSubgroupID<base_t:: lastInvocationInLevel;
131
- if (base_t:: participate)
132
+ __base. participate = prevSubgroupID<__base. lastInvocationInLevel;
133
+ if (__base. participate)
132
134
{
133
135
// we either need to prevent OOB read altogether OR cmov identity after the fact
134
- base_t:: lastLevelScan = scratchAccessor.get (scanStoreIndex-storeLoadIndexDiff);
136
+ __base. lastLevelScan = scratchAccessor.get (scanStoreIndex-storeLoadIndexDiff);
135
137
}
136
- base_t:: lastInvocationInLevel = base_t:: lastInvocation>>logShift;
138
+ __base. lastInvocationInLevel = lastInvocation>>logShift;
137
139
}
138
140
#undef scanStoreIndex
139
141
140
- //assert((base_t:: lastInvocation>>subgroupSizeLog2)==base_t:: lastInvocationInLevel);
142
// NOTE(review): `lastInvocation` is a local in this version (declared above); confirm whether `__base` still exposes it before re-enabling this assert
+ //assert((__base. lastInvocation>>subgroupSizeLog2)==__base. lastInvocationInLevel);
141
143
142
144
// the very first prefix sum we did is in a register, not Accessor scratch mem hence the special path
143
- if (prevSubgroupID<base_t:: lastInvocationInLevel)
144
- base_t:: firstLevelScan = binop (base_t:: lastLevelScan,base_t:: firstLevelScan);
145
+ if (prevSubgroupID<__base. lastInvocationInLevel)
146
+ __base. firstLevelScan = binop (__base. lastLevelScan,__base. firstLevelScan);
145
147
}
146
148
147
149
if (Exclusive)
148
150
{
149
- base_t:: firstLevelScan = glsl::subgroupShuffleUp (base_t:: firstLevelScan,1 );
151
+ __base. firstLevelScan = glsl::subgroupShuffleUp (__base. firstLevelScan,1 );
150
152
// shuffle doesn't work between subgroups but the value for each elected subgroup invocation is just the previous higherLevelExclusive
151
153
// note that we assume we might have to do scans with itemCount <= gl_WorkgroupSize
152
154
if (glsl::subgroupElect ())
153
- base_t:: firstLevelScan = bool (subgroupID) ? base_t:: lastLevelScan:BinOp::identity;
155
+ __base. firstLevelScan = bool (subgroupID) ? __base. lastLevelScan:BinOp::identity;
154
156
}
155
- return base_t:: firstLevelScan;
157
+ return __base. firstLevelScan;
156
158
}
157
159
};
158
160
}
0 commit comments