@@ -158,67 +158,80 @@ uint irr_glsl_workgroupBallotFindMSB();
158
158
159
159
160
160
// TODO: should the while loop be [[unroll]]-ed 5 times?
161
// IRR_GLSL_WORKGROUP_COMMON_IMPL_HEAD -- shared opening part of the workgroup reduce/scan implementations.
// This span is a diff hunk: lines starting with "-" are the old text, lines starting with "+" are the new text,
// unmarked lines are unchanged context, and the bare numbers are old/new line numbers.
//
// New text of the macro:
//   * takes one extra argument, LOOP_POSTLUDE, which is expanded as the last thing in every iteration of the while loop;
//   * expands SUBGROUP_SCRATCH_INITIALIZE(VALUE,ITEM_COUNT,IDENTITY,INVCONV), then computes
//     firstLevelScan = INVCONV(INCLUSIVE_SUBGROUP_OP(false,VALUE)) and copies it into the mutable `scan`;
//   * derives nextStoreIndex (renamed from nextPassStoreIndex) and initializes the mutable storeIndex / loadIndex from
//     nextStoreIndex / subgroupScanStoreOffset, so that a postlude can move them between passes;
//   * the while loop now runs only while lastInvocationInLevel >= irr_glsl_SubgroupSize*irr_glsl_SubgroupSize; one further
//     pass with the same body, but without LOOP_POSTLUDE, follows under `if (lastInvocationInLevel>= irr_glsl_SubgroupSize)`.
// Each pass: participating invocations that are the last invocation in the level, or for which possibleProp is true, store
// `scan` to the scratch array at storeIndex; after barrier() lastInvocationInLevel is shifted right by subgroupSizeLog2,
// `participate` is recomputed, and the remaining participants load the scratch value at loadIndex and run
// INCLUSIVE_SUBGROUP_OP on it again.
//
// The expansion leaves lastInvocation, lastInvocationInLevel, firstLevelScan, scan, storeIndex, loadIndex and participate
// declared in the enclosing scope.
// NOTE(review): loMask, pseudoSubgroupInvocation, subgroupScanStoreOffset and CONDITIONAL_BARRIER are not defined in this
// excerpt -- presumably they come from SUBGROUP_SCRATCH_INITIALIZE or from elsewhere in the file; confirm.
//
// The removed ("-") lines in this hunk also delete IRR_GLSL_WORKGROUP_REDUCE_IMPL_TAIL and the old placeholder plus
// commented-out body of IRR_GLSL_WORKGROUP_SCAN_IMPL_TAIL.
- #define IRR_GLSL_WORKGROUP_COMMON_IMPL_HEAD(CONV,INCLUSIVE_SUBGROUP_OP,VALUE,IDENTITY,INVCONV,ITEM_COUNT) SUBGROUP_SCRATCH_INITIALIZE(VALUE,ITEM_COUNT,IDENTITY,INVCONV) \
161
+ #define IRR_GLSL_WORKGROUP_COMMON_IMPL_HEAD(CONV,INCLUSIVE_SUBGROUP_OP,VALUE,IDENTITY,INVCONV,ITEM_COUNT,LOOP_POSTLUDE ) SUBGROUP_SCRATCH_INITIALIZE(VALUE,ITEM_COUNT,IDENTITY,INVCONV) \
162
162
const uint lastInvocation = ITEM_COUNT- 1u; \
163
163
uint lastInvocationInLevel = lastInvocation; \
164
164
const uint firstLevelScan = INVCONV(INCLUSIVE_SUBGROUP_OP(false,VALUE)); \
165
165
uint scan = firstLevelScan; \
166
166
const bool possibleProp = pseudoSubgroupInvocation== loMask; \
167
167
const uint subgroupSizeLog2 = findLSB(irr_glsl_SubgroupSize); \
168
168
const uint pseudoSubgroupID = (gl_LocalInvocationIndex>> subgroupSizeLog2); \
169
- const uint nextPassStoreIndex = irr_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset( \
169
+ const uint nextStoreIndex = irr_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset( \
170
170
irr_glsl_subgroup_impl_getSubgroupEmulationMemoryStart( \
171
171
irr_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask,pseudoSubgroupID) \
172
172
), \
173
173
irr_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask,pseudoSubgroupID) \
174
174
); \
175
- bool participate = gl_LocalInvocationIndex<= lastInvocationInLevel;
176
-
177
- #define IRR_GLSL_WORKGROUP_REDUCE_IMPL_TAIL(CONV,INCLUSIVE_SUBGROUP_OP,INVCONV) while (lastInvocationInLevel>= irr_glsl_SubgroupSize) \
175
+ uint storeIndex = nextStoreIndex; \
176
+ uint loadIndex = subgroupScanStoreOffset; \
177
+ bool participate = gl_LocalInvocationIndex<= lastInvocationInLevel; \
178
+ while (lastInvocationInLevel>= irr_glsl_SubgroupSize* irr_glsl_SubgroupSize) \
178
179
{ \
179
180
CONDITIONAL_BARRIER \
180
181
if (participate) \
181
182
{ \
182
183
if (any (bvec2 (gl_LocalInvocationIndex== lastInvocationInLevel,possibleProp))) \
183
- _IRR_GLSL_SCRATCH_SHARED_DEFINED_[nextPassStoreIndex ] = scan; \
184
+ _IRR_GLSL_SCRATCH_SHARED_DEFINED_[storeIndex ] = scan; \
184
185
} \
185
186
barrier(); \
186
187
participate = gl_LocalInvocationIndex<= (lastInvocationInLevel>>= subgroupSizeLog2); \
187
188
if (participate) \
188
189
{ \
189
- const uint prevLevelScan = _IRR_GLSL_SCRATCH_SHARED_DEFINED_[subgroupScanStoreOffset ]; \
190
+ const uint prevLevelScan = _IRR_GLSL_SCRATCH_SHARED_DEFINED_[loadIndex ]; \
190
191
scan = INVCONV(INCLUSIVE_SUBGROUP_OP(false,CONV(prevLevelScan))); \
191
192
} \
193
+ LOOP_POSTLUDE \
192
194
} \
193
- CONDITIONAL_BARRIER
194
-
195
- #define IRR_GLSL_WORKGROUP_SCAN_IMPL_TAIL(CONV,INCLUSIVE_SUBGROUP_OP,INVCONV,OP) ;
196
- /*
197
- #define IRR_GLSL_WORKGROUP_SCAN_IMPL_TAIL(CONV,INCLUSIVE_SUBGROUP_OP,INVCONV,OP) while (lastInvocationInLevel>=irr_glsl_SubgroupSize) \
195
+ if (lastInvocationInLevel>= irr_glsl_SubgroupSize) \
198
196
{ \
199
197
CONDITIONAL_BARRIER \
200
- const bool prop = propagateReduction&&gl_LocalInvocationIndex<lastInvocationInLevel || gl_LocalInvocationIndex==lastInvocationInLevel; \
201
- if (prop) \
202
- _IRR_GLSL_SCRATCH_SHARED_DEFINED_[lowerIndex] = scan; \
198
+ if (participate) \
199
+ { \
200
+ if (any (bvec2 (gl_LocalInvocationIndex== lastInvocationInLevel,possibleProp))) \
201
+ _IRR_GLSL_SCRATCH_SHARED_DEFINED_[storeIndex] = scan; \
202
+ } \
203
203
barrier(); \
204
- memoryBarrierShared(); \
205
- lastInvocationInLevel >>= subgroupSizeLog2; \
206
- if (gl_LocalInvocationIndex<=lastInvocationInLevel) \
204
+ participate = gl_LocalInvocationIndex<= (lastInvocationInLevel>>= subgroupSizeLog2); \
205
+ if (participate) \
207
206
{ \
208
- const uint prevLevelScan = _IRR_GLSL_SCRATCH_SHARED_DEFINED_[lowerIndex+lo2HiIndex ]; \
207
+ const uint prevLevelScan = _IRR_GLSL_SCRATCH_SHARED_DEFINED_[loadIndex ]; \
209
208
scan = INVCONV(INCLUSIVE_SUBGROUP_OP(false,CONV(prevLevelScan))); \
210
209
} \
211
- lowerIndex += lastInvocationInLevel+1u; \
212
- } \
213
- CONDITIONAL_BARRIER \
210
+ }
211
+
212
// IRR_GLSL_WORKGROUP_SCAN_IMPL_LOOP_POSTLUDE -- statement block meant to be passed as the LOOP_POSTLUDE argument of
// IRR_GLSL_WORKGROUP_COMMON_IMPL_HEAD (irr_glsl_workgroupBallotScanBitCount_impl_impl does exactly that).
// It advances both storeIndex and loadIndex by (lastInvocationInLevel + 1u). Because the head macro expands the postlude
// after it has already shifted lastInvocationInLevel for the pass, the increment uses the post-shift value.
// Requires storeIndex, loadIndex and lastInvocationInLevel to be declared by the surrounding expansion.
+ #define IRR_GLSL_WORKGROUP_SCAN_IMPL_LOOP_POSTLUDE { \
213
+ const uint memoryUsedThisPass = lastInvocationInLevel+ 1u; \
214
+ storeIndex += memoryUsedThisPass; \
215
+ loadIndex += memoryUsedThisPass; \
216
+ }
217
+
218
// IRR_GLSL_WORKGROUP_SCAN_IMPL_TAIL -- closing part of a workgroup scan; it ends in a `return` on both branches, so it
// must be the last thing expanded in a function returning the type produced by CONV.
// This span is a diff hunk: "-" lines are removed old text, "+" lines are the new text, unmarked lines are unchanged.
//
// New text of the macro:
//   1. CONDITIONAL_BARRIER.
//   2. If lastInvocation >= irr_glsl_SubgroupSize: invocations with index below lastInvocationInLevel write `scan` into the
//      scratch array at loadIndex+1u, followed by barrier().
//   3. If EXCLUSIVE: every invocation stores firstLevelScan at scratch[lastInvocationInLevel + gl_LocalInvocationIndex + 1u],
//      a barrier() follows, and the macro returns CONV of the slot one below its own (the value stored by the previous
//      invocation), or CONV(0u) for invocation 0.
//      Otherwise it returns CONV(firstLevelScan).
// INCLUSIVE_SUBGROUP_OP, INVCONV and OP are accepted but not referenced by the new macro text; OP is only mentioned in the
// commented-out code that follows the macro.
// NOTE(review): nothing in the new macro text reads back the values written in step 2; the code that follows is
// commented out -- confirm whether results are complete when more than one level is needed.
+ #define IRR_GLSL_WORKGROUP_SCAN_IMPL_TAIL(EXCLUSIVE,CONV,INCLUSIVE_SUBGROUP_OP,INVCONV,OP) CONDITIONAL_BARRIER \
214
219
if (lastInvocation>= irr_glsl_SubgroupSize) \
215
220
{ \
216
- lowerIndex -= lastInvocationInLevel; \
217
221
if (gl_LocalInvocationIndex< lastInvocationInLevel) \
218
- _IRR_GLSL_SCRATCH_SHARED_DEFINED_[lowerIndex+lo2HiIndex] = scan; \
219
- lowerIndex--; \
222
+ _IRR_GLSL_SCRATCH_SHARED_DEFINED_[loadIndex+ 1u] = scan; \
220
223
barrier(); \
221
- memoryBarrierShared(); \
224
+ } \
225
+ if (EXCLUSIVE) \
226
+ { \
227
+ const uint sharedOffsetOutTheWay = lastInvocationInLevel+ gl_LocalInvocationIndex; \
228
+ _IRR_GLSL_SCRATCH_SHARED_DEFINED_[sharedOffsetOutTheWay+ 1u] = firstLevelScan; \
229
+ barrier(); \
230
+ return CONV(gl_LocalInvocationIndex!= 0u ? _IRR_GLSL_SCRATCH_SHARED_DEFINED_[sharedOffsetOutTheWay]: 0u); \
231
+ } \
232
+ else \
233
+ return CONV(firstLevelScan);
234
+ /*
222
235
const uint shiftedInvocationIndex = gl_LocalInvocationIndex+irr_glsl_SubgroupSize; \
223
236
const uint lo2HiIndexDownsweep = lo2HiIndex+irr_glsl_SubgroupSize; \
224
237
for (uint logShift=(findMSB(lastInvocation)/subgroupSizeLog2-1u)*subgroupSizeLog2; logShift>0u; logShift-=subgroupSizeLog2) \
@@ -236,8 +249,8 @@ uint irr_glsl_workgroupBallotFindMSB();
236
249
} \
237
250
if (shiftedInvocationIndex<=lastInvocation) \
238
251
firstLevelScan = INVCONV(OP (CONV(firstLevelScan),CONV(_IRR_GLSL_SCRATCH_SHARED_DEFINED_[lowerIndex-1u]))); \
239
- }
240
252
*/
253
+
241
254
uint irr_glsl_workgroupBallotScanBitCount_impl(in bool exclusive);
242
255
243
256
uint irr_glsl_workgroupBallotInclusiveBitCount()
@@ -249,22 +262,20 @@ uint irr_glsl_workgroupBallotExclusiveBitCount()
249
262
return irr_glsl_workgroupBallotScanBitCount_impl(true);
250
263
}
251
264
265
// Helper for irr_glsl_workgroupBallotScanBitCount_impl: runs the macro-based workgroup scan over `localBitfield`.
// The scan is instantiated with irr_glsl_subgroupInclusiveAdd_impl as the subgroup op, irr_glsl_identityFunction for both
// conversions, 0u as the identity and irr_glsl_workgroupBallot_impl_BitfieldDWORDs as the item count.
// The function has no explicit return statement of its own: the value is returned from inside the
// IRR_GLSL_WORKGROUP_SCAN_IMPL_TAIL expansion. EXCLUSIVE is passed as the literal `true`, so the macro's exclusive
// branch is the one taken.
// NOTE(review): the leading barrier() presumably lets every invocation finish reading the scratch array (the caller
// backs it up just before this call) before the scan overwrites it -- confirm.
+ uint irr_glsl_workgroupBallotScanBitCount_impl_impl(in uint localBitfield)
266
+ {
267
+ barrier();
268
+ IRR_GLSL_WORKGROUP_COMMON_IMPL_HEAD(irr_glsl_identityFunction,irr_glsl_subgroupInclusiveAdd_impl,localBitfield,0u,irr_glsl_identityFunction,irr_glsl_workgroupBallot_impl_BitfieldDWORDs,IRR_GLSL_WORKGROUP_SCAN_IMPL_LOOP_POSTLUDE)
269
+ IRR_GLSL_WORKGROUP_SCAN_IMPL_TAIL(true,irr_glsl_identityFunction,irr_glsl_subgroupInclusiveAdd_impl,irr_glsl_identityFunction,irr_glsl_add)
270
+ }
252
271
uint irr_glsl_workgroupBallotScanBitCount_impl(in bool exclusive)
253
272
{
254
273
uint localBitfieldBackup;
255
274
if (gl_LocalInvocationIndex< irr_glsl_workgroupBallot_impl_BitfieldDWORDs)
256
275
localBitfieldBackup = _IRR_GLSL_SCRATCH_SHARED_DEFINED_[gl_LocalInvocationIndex];
257
276
258
277
// scan hierarchically
259
- uint globalCount;
260
- {
261
- barrier();
262
- IRR_GLSL_WORKGROUP_COMMON_IMPL_HEAD(irr_glsl_identityFunction,irr_glsl_subgroupInclusiveAdd_impl,localBitfieldBackup,0u,irr_glsl_identityFunction,irr_glsl_workgroupBallot_impl_BitfieldDWORDs)
263
- IRR_GLSL_WORKGROUP_SCAN_IMPL_TAIL(irr_glsl_identityFunction,irr_glsl_subgroupInclusiveAdd_impl,irr_glsl_identityFunction,irr_glsl_add)
264
- _IRR_GLSL_SCRATCH_SHARED_DEFINED_[lastInvocationInLevel+ 1u+ gl_LocalInvocationIndex] = firstLevelScan;
265
- barrier();
266
- globalCount = gl_LocalInvocationIndex!= 0u ? _IRR_GLSL_SCRATCH_SHARED_DEFINED_[lastInvocationInLevel+ gl_LocalInvocationIndex]: 0u;
267
- }
278
+ uint globalCount = irr_glsl_workgroupBallotScanBitCount_impl_impl(localBitfieldBackup);
268
279
269
280
// restore
270
281
if (gl_LocalInvocationIndex< irr_glsl_workgroupBallot_impl_BitfieldDWORDs)
0 commit comments