@@ -2,22 +2,20 @@
 #define _IRR_BUILTIN_GLSL_WORKGROUP_BALLOT_INCLUDED_
 
 
-#include <irr/builtin/glsl/subgroup/arithmetic_portability.glsl>
-#include <irr/builtin/glsl/workgroup/basic.glsl>
-
-
-#define irr_glsl_workgroupBallot_impl_getDWORD(IX) (IX>>5)
-#define irr_glsl_workgroupBallot_impl_BitfieldDWORDs irr_glsl_workgroupBallot_impl_getDWORD(_IRR_GLSL_WORKGROUP_SIZE_+31)
 
-
-#define _IRR_GLSL_WORKGROUP_BALLOT_SHARED_SIZE_NEEDED_IMPL_ (irr_glsl_workgroupBallot_impl_BitfieldDWORDs+_IRR_GLSL_WORKGROUP_SIZE_)
+#include <irr/builtin/glsl/workgroup/shared_ballot.glsl>
 
 
 /*
 #ifdef GL_KHR_subgroup_arithmetic
 
+
 #define CONDITIONAL_BARRIER
 
+// just do nothing here
+#define SUBGROUP_SCRATCH_INITIALIZE(IDENTITY) ;
+
+
 #else
 */
 
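Note on the new include (not part of this diff): `shared_ballot.glsl` is not shown here, but the DWORD helpers deleted above are still used further down in this file, so they have presumably just moved into that shared header together with the scratch-size bookkeeping. A minimal sketch of what it is assumed to provide:

#define irr_glsl_workgroupBallot_impl_getDWORD(IX) (IX>>5)
#define irr_glsl_workgroupBallot_impl_BitfieldDWORDs irr_glsl_workgroupBallot_impl_getDWORD(_IRR_GLSL_WORKGROUP_SIZE_+31)
// one bitfield DWORD per 32 invocations plus per-invocation scratch; simplified here,
// the real header presumably also folds in the subgroup-arithmetic emulation requirement
#define _IRR_GLSL_WORKGROUP_BALLOT_SHARED_SIZE_NEEDED_ (irr_glsl_workgroupBallot_impl_BitfieldDWORDs+_IRR_GLSL_WORKGROUP_SIZE_)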
@@ -28,12 +26,6 @@ If `GL_KHR_subgroup_arithmetic` is not available then these functions require em
 `irr_glsl_workgroupOp`s then the workgroup size must not be smaller than half a subgroup but having workgroups smaller than a subgroup is extremely bad practice.
 */
 
-#if IRR_GLSL_GREATER(_IRR_GLSL_SUBGROUP_ARITHMETIC_EMULATION_SHARED_SIZE_NEEDED_,_IRR_GLSL_WORKGROUP_BALLOT_SHARED_SIZE_NEEDED_IMPL_)
-	#define _IRR_GLSL_WORKGROUP_BALLOT_SHARED_SIZE_NEEDED_ _IRR_GLSL_SUBGROUP_ARITHMETIC_EMULATION_SHARED_SIZE_NEEDED_
-#else
-	#define _IRR_GLSL_WORKGROUP_BALLOT_SHARED_SIZE_NEEDED_ _IRR_GLSL_WORKGROUP_BALLOT_SHARED_SIZE_NEEDED_IMPL_
-#endif
-
 // #endif
 
 
@@ -45,12 +37,17 @@ If `GL_KHR_subgroup_arithmetic` is not available then these functions require em
 #else
 #if IRR_GLSL_GREATER(_IRR_GLSL_WORKGROUP_BALLOT_SHARED_SIZE_NEEDED_,0)
 	#define _IRR_GLSL_SCRATCH_SHARED_DEFINED_ irr_glsl_workgroupBallotScratchShared
+	#define _IRR_GLSL_SCRATCH_SHARED_SIZE_DEFINED_ _IRR_GLSL_WORKGROUP_BALLOT_SHARED_SIZE_NEEDED_
 	shared uint _IRR_GLSL_SCRATCH_SHARED_DEFINED_[_IRR_GLSL_WORKGROUP_BALLOT_SHARED_SIZE_NEEDED_];
 #endif
 #endif
 
 
 
+#include <irr/builtin/glsl/subgroup/arithmetic_portability.glsl>
+
+
+
 // puts the result into shared memory at offsets [0,_IRR_GLSL_WORKGROUP_SIZE_/32)
 void irr_glsl_workgroupBallot_noBarriers(in bool value)
 {
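A usage sketch of the scratch aliasing this `#else` branch implies (inferred from the guard rather than spelled out in the diff; the array name and sizes below are made up): a user shader can declare one shared array for all its needs and point the header at it, and the new `_IRR_GLSL_SCRATCH_SHARED_SIZE_DEFINED_` presumably lets the header verify the aliased array is big enough.

// hypothetical user shader, names and sizes are illustrative
#define _IRR_GLSL_WORKGROUP_SIZE_ 256
shared uint mySharedScratch[512]; // must cover _IRR_GLSL_WORKGROUP_BALLOT_SHARED_SIZE_NEEDED_
#define _IRR_GLSL_SCRATCH_SHARED_DEFINED_ mySharedScratch
#define _IRR_GLSL_SCRATCH_SHARED_SIZE_DEFINED_ 512
#include <irr/builtin/glsl/workgroup/ballot.glsl>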
@@ -166,13 +163,8 @@ uint irr_glsl_workgroupBallotFindMSB();
 const bool possibleProp = pseudoSubgroupInvocation==loMask; \
 const uint subgroupSizeLog2 = findLSB(irr_glsl_SubgroupSize); \
 const uint pseudoSubgroupID = (gl_LocalInvocationIndex>>subgroupSizeLog2); \
-const uint nextStoreIndex = irr_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset( \
-	irr_glsl_subgroup_impl_getSubgroupEmulationMemoryStart( \
-		irr_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask,pseudoSubgroupID) \
-	), \
-	irr_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask,pseudoSubgroupID) \
-); \
-uint scanStoreIndex = (ITEM_COUNT<<1u)+gl_LocalInvocationIndex; \
+const uint nextStoreIndex = irr_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask,pseudoSubgroupID); \
+uint scanStoreIndex = irr_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask,lastInvocation)+gl_LocalInvocationIndex+1u; \
 bool participate = gl_LocalInvocationIndex<=lastInvocationInLevel; \
 while (lastInvocationInLevel>=irr_glsl_SubgroupSize*irr_glsl_SubgroupSize) \
 { \
@@ -229,15 +221,18 @@ uint irr_glsl_workgroupBallotFindMSB();
 if (gl_LocalInvocationIndex<=lastInvocation && pseudoSubgroupID!=0u) \
 { \
 	const uint higherLevelExclusive = _IRR_GLSL_SCRATCH_SHARED_DEFINED_[scanLoadIndex+currentToHighLevel-1u]; \
-	firstLevelScan = INVCONV(OP (CONV(higherLevelExclusive),CONV(firstLevelScan))); \
+	firstLevelScan = INVCONV(OP(CONV(higherLevelExclusive), CONV(firstLevelScan))); \
 } \
 } \
 if (EXCLUSIVE) \
 { \
-	const uint sharedOffsetOutTheWay = scanStoreIndex+lastInvocationInLevel; \
-	_IRR_GLSL_SCRATCH_SHARED_DEFINED_[sharedOffsetOutTheWay+1u] = firstLevelScan; \
-	barrier(); \
-	return gl_LocalInvocationIndex!=0u ? CONV(_IRR_GLSL_SCRATCH_SHARED_DEFINED_[sharedOffsetOutTheWay]):IDENTITY; \
+	if (gl_LocalInvocationIndex<lastInvocation) \
+		_IRR_GLSL_SCRATCH_SHARED_DEFINED_[gl_LocalInvocationIndex+1u] = firstLevelScan; \
+	barrier(); \
+	if (gl_LocalInvocationIndex<lastInvocation) \
+		return gl_LocalInvocationIndex!=0u ? CONV(_IRR_GLSL_SCRATCH_SHARED_DEFINED_[gl_LocalInvocationIndex]):IDENTITY; \
+	else \
+		return IDENTITY; \
 } \
 else \
 	return CONV(firstLevelScan);
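The EXCLUSIVE branch now turns the inclusive scan held in `firstLevelScan` into an exclusive one by shifting results one slot to the right through shared memory, instead of parking a copy past `lastInvocationInLevel`. A minimal, self-contained sketch of that shift-by-one technique (the scratch array, its size and the function name below are illustrative, not the header's API):

shared uint shiftScratch[256]; // illustrative scratch and size (>= workgroup size)

uint exclusiveFromInclusive(in uint inclusive, in uint identity)
{
	barrier(); // make sure no invocation is still reading the scratch
	if (gl_LocalInvocationIndex+1u<gl_WorkGroupSize.x)
		shiftScratch[gl_LocalInvocationIndex+1u] = inclusive; // shift everything one slot right
	barrier();
	// invocation 0 has no predecessor, so it receives the identity element
	return gl_LocalInvocationIndex!=0u ? shiftScratch[gl_LocalInvocationIndex]:identity;
}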
@@ -262,20 +257,28 @@ uint irr_glsl_workgroupBallotScanBitCount_impl_impl(in uint localBitfield)
 }
 uint irr_glsl_workgroupBallotScanBitCount_impl(in bool exclusive)
 {
-	uint localBitfieldBackup;
-	if (gl_LocalInvocationIndex<irr_glsl_workgroupBallot_impl_BitfieldDWORDs)
-		localBitfieldBackup = _IRR_GLSL_SCRATCH_SHARED_DEFINED_[gl_LocalInvocationIndex];
+	const uint _dword = irr_glsl_workgroupBallot_impl_getDWORD(gl_LocalInvocationIndex);
+	const uint localBitfield = _IRR_GLSL_SCRATCH_SHARED_DEFINED_[_dword];
 
-	// scan hierarchically
-	uint globalCount = irr_glsl_workgroupBallotScanBitCount_impl_impl(localBitfieldBackup);
-
-	// restore
-	if (gl_LocalInvocationIndex<irr_glsl_workgroupBallot_impl_BitfieldDWORDs)
-		_IRR_GLSL_SCRATCH_SHARED_DEFINED_[gl_LocalInvocationIndex] = localBitfieldBackup;
-	barrier();
+	uint globalCount;
+	{
+		uint localBitfieldBackup;
+		if (gl_LocalInvocationIndex<irr_glsl_workgroupBallot_impl_BitfieldDWORDs)
+			localBitfieldBackup = _IRR_GLSL_SCRATCH_SHARED_DEFINED_[gl_LocalInvocationIndex];
+		// scan hierarchically, invocations with `gl_LocalInvocationIndex>=irr_glsl_workgroupBallot_impl_BitfieldDWORDs` will have garbage here
+		irr_glsl_workgroupBallotScanBitCount_impl_impl(localBitfieldBackup);
+		// fix it (abuse the fact memory is left over)
+		globalCount = _dword!=0u ? _IRR_GLSL_SCRATCH_SHARED_DEFINED_[_dword]:0u;
+		barrier();
+
+		// restore
+		if (gl_LocalInvocationIndex<irr_glsl_workgroupBallot_impl_BitfieldDWORDs)
+			_IRR_GLSL_SCRATCH_SHARED_DEFINED_[gl_LocalInvocationIndex] = localBitfieldBackup;
+		barrier();
+	}
 
 	const uint mask = 0xffffffffu>>((exclusive ? 32u:31u)-(gl_LocalInvocationIndex&31u));
-	return globalCount+bitCount(localBitfieldBackup & mask);
+	return globalCount+bitCount(localBitfield & mask);
 }
 
 #undef CONDITIONAL_BARRIER
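For reference, a hand-worked evaluation of the new return path (assuming, as the hierarchical scan above intends, that `globalCount` ends up holding the number of ballot bits set in the DWORDs below `_dword`):

// take gl_LocalInvocationIndex==37: _dword==1 and the bit position is 37&31u == 5
//   exclusive: mask = 0xffffffffu>>(32u-5u) == 0x0000001Fu -> selects ballot bits 32..36
//   inclusive: mask = 0xffffffffu>>(31u-5u) == 0x0000003Fu -> selects ballot bits 32..37
// the result is globalCount (bits set in DWORD 0) plus bitCount(localBitfield & mask)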