Skip to content

Commit d853685

Browse files
Fix subgroup arithmetic emulation; add better barrier semantics to workgroup ballot and broadcast
1 parent a1abefd commit d853685

File tree

3 files changed

+31
-41
lines changed

3 files changed

+31
-41
lines changed

include/irr/builtin/glsl/ext/LumaMeter/common.glsl

Lines changed: 3 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -280,30 +280,9 @@ float irr_glsl_ext_LumaMeter_impl_getMeasuredLumaLog2(in irr_glsl_ext_LumaMeter_
280280
// TODO: figure out why the irr_glsl_workgroupExclusiveAdd function doesn't work
281281
uint irr_glsl_workgroupExclusiveAdd2(uint val)
282282
{
283-
#if 0
284-
uint pingpong = uint(_IRR_GLSL_EXT_LUMA_METER_BIN_COUNT);
285-
//! Bad INEFFICIENT Kogge-Stone adder, don't implement this way!
286-
for (int pass = 1; pass < _IRR_GLSL_EXT_LUMA_METER_BIN_COUNT; pass <<= 1)
287-
{
288-
uint index = gl_LocalInvocationIndex + pingpong;
289-
pingpong ^= _IRR_GLSL_EXT_LUMA_METER_BIN_COUNT;
290-
291-
_IRR_GLSL_SCRATCH_SHARED_DEFINED_[index] = val;
292-
barrier();
293-
memoryBarrierShared();
294-
if (gl_LocalInvocationIndex >= pass)
295-
val += _IRR_GLSL_SCRATCH_SHARED_DEFINED_[index - pass];
296-
}
297-
barrier();
298-
memoryBarrierShared();
299-
return val;
300-
#elif 1
301-
barrier();
302-
memoryBarrierShared();
303-
_IRR_GLSL_SCRATCH_SHARED_DEFINED_[gl_LocalInvocationIndex] = 0u;
304-
_IRR_GLSL_SCRATCH_SHARED_DEFINED_[_IRR_GLSL_EXT_LUMA_METER_BIN_COUNT + gl_LocalInvocationIndex] = 0u;
305283
barrier();
306284
memoryBarrierShared();
285+
#if 1
307286
const uint K = irr_glsl_SubgroupSize;
308287
const uint outIx = gl_LocalInvocationIndex/K;
309288
uint subScan = irr_glsl_subgroupInclusiveAdd(val);
@@ -322,6 +301,8 @@ uint irr_glsl_workgroupExclusiveAdd2(uint val)
322301
#else
323302
_IRR_GLSL_SCRATCH_SHARED_DEFINED_[gl_LocalInvocationIndex] = 0u;
324303
_IRR_GLSL_SCRATCH_SHARED_DEFINED_[_IRR_GLSL_EXT_LUMA_METER_BIN_COUNT+gl_LocalInvocationIndex] = 0u;
304+
barrier();
305+
memoryBarrierShared();
325306
return irr_glsl_workgroupExclusiveAdd(val);
326307
#endif
327308
}

include/irr/builtin/glsl/subgroup/arithmetic_portability.glsl

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
#else
1717
*/
1818

19-
#define SUBGROUP_BARRIERS
19+
#define SUBGROUP_BARRIERS memoryBarrierShared()
2020

2121
//#endif
2222

@@ -141,28 +141,34 @@ TODO: Keep the pseudo subgroup and offset code DRY, move to a function.
141141
const uint pseudoSubgroupID = gl_LocalInvocationIndex&hiMask; \
142142
const uint scratchOffset = (pseudoSubgroupID<<1u)|pseudoSubgroupInvocation; \
143143
const uint primaryOffset = scratchOffset+irr_glsl_HalfSubgroupSize; \
144+
SUBGROUP_BARRIERS; \
144145
_IRR_GLSL_SCRATCH_SHARED_DEFINED_[primaryOffset] = INVCONV (VALUE); \
145146
if (CLEAR && pseudoSubgroupInvocation<irr_glsl_HalfSubgroupSize) \
146147
_IRR_GLSL_SCRATCH_SHARED_DEFINED_[scratchOffset] = INVCONV (IDENTITY); \
147148
SUBGROUP_BARRIERS; \
148149
VALUE = OP (VALUE,CONV (_IRR_GLSL_SCRATCH_SHARED_DEFINED_[primaryOffset-1u])); \
150+
SUBGROUP_BARRIERS; \
149151
_IRR_GLSL_SCRATCH_SHARED_DEFINED_[primaryOffset] = INVCONV (VALUE); \
150152
SUBGROUP_BARRIERS; \
151153
VALUE = OP (VALUE,CONV (_IRR_GLSL_SCRATCH_SHARED_DEFINED_[primaryOffset-2u])); \
152154
for (uint stp=irr_glsl_MinSubgroupSize; stp<irr_glsl_SubgroupSize; stp<<=1u) \
153155
{ \
156+
SUBGROUP_BARRIERS; \
154157
_IRR_GLSL_SCRATCH_SHARED_DEFINED_[primaryOffset] = INVCONV (VALUE); \
155158
SUBGROUP_BARRIERS; \
156159
VALUE = OP (VALUE,CONV (_IRR_GLSL_SCRATCH_SHARED_DEFINED_[primaryOffset-stp])); \
157-
}
160+
} \
161+
SUBGROUP_BARRIERS;
158162

159163

160164
#define IRR_GLSL_SUBGROUP_REDUCE(CONV,OP,VALUE,CLEAR,IDENTITY,INVCONV) IRR_GLSL_SUBGROUP_ARITHMETIC_IMPL(CONV,OP,VALUE,CLEAR,IDENTITY,INVCONV) \
161165
_IRR_GLSL_SCRATCH_SHARED_DEFINED_[primaryOffset] = INVCONV (VALUE); \
162166
SUBGROUP_BARRIERS; \
163167
const uint maxPseudoSubgroupInvocation = (_IRR_GLSL_WORKGROUP_SIZE_-1u)&loMask; \
164168
const uint maxPseudoSubgroupID = (_IRR_GLSL_WORKGROUP_SIZE_-1u)&hiMask; \
165-
return CONV(_IRR_GLSL_SCRATCH_SHARED_DEFINED_[((maxPseudoSubgroupID<<1u)|maxPseudoSubgroupInvocation)+irr_glsl_HalfSubgroupSize])
169+
const uint lastItem = _IRR_GLSL_SCRATCH_SHARED_DEFINED_[((maxPseudoSubgroupID<<1u)|maxPseudoSubgroupInvocation)+irr_glsl_HalfSubgroupSize]; \
170+
SUBGROUP_BARRIERS; \
171+
return CONV (lastItem);
166172

167173

168174

@@ -267,7 +273,9 @@ float irr_glsl_subgroupMax_impl(in bool clearScratchToIdentity, float value)
267273
#define IRR_GLSL_SUBGROUP_EXCLUSIVE_SCAN(CONV,OP,VALUE,CLEAR,IDENTITY,INVCONV) IRR_GLSL_SUBGROUP_ARITHMETIC_IMPL(CONV,OP,VALUE,CLEAR,IDENTITY,INVCONV) \
268274
_IRR_GLSL_SCRATCH_SHARED_DEFINED_[primaryOffset] = INVCONV (VALUE); \
269275
SUBGROUP_BARRIERS; \
270-
return CONV (_IRR_GLSL_SCRATCH_SHARED_DEFINED_[primaryOffset-1u])
276+
const uint prevItem = _IRR_GLSL_SCRATCH_SHARED_DEFINED_[primaryOffset-1u]; \
277+
SUBGROUP_BARRIERS; \
278+
return CONV (prevItem);
271279

272280

273281
uint irr_glsl_subgroupInclusiveAnd_impl(in bool clearScratchToIdentity, uint value)
@@ -348,16 +356,7 @@ float irr_glsl_subgroupExclusiveOr_impl(in bool clearScratchToIdentity, float va
348356

349357
uint irr_glsl_subgroupInclusiveAdd_impl(in bool clearScratchToIdentity, uint value)
350358
{
351-
const uint loMask = 3u;
352-
uint pseudoSubgroupInvocation = gl_LocalInvocationIndex & loMask;
353-
uint pseudoSubgroupID = gl_LocalInvocationIndex & (~loMask);
354-
_IRR_GLSL_SCRATCH_SHARED_DEFINED_[gl_LocalInvocationIndex] = value;
355-
barrier();
356-
memoryBarrierShared();
357-
for (uint i=pseudoSubgroupID; i<gl_LocalInvocationIndex; i++)
358-
value += _IRR_GLSL_SCRATCH_SHARED_DEFINED_[i];
359-
return value;
360-
//IRR_GLSL_SUBGROUP_INCLUSIVE_SCAN(irr_glsl_identityFunction,irr_glsl_add,value,clearScratchToIdentity,0u,irr_glsl_identityFunction);
359+
IRR_GLSL_SUBGROUP_INCLUSIVE_SCAN(irr_glsl_identityFunction,irr_glsl_add,value,clearScratchToIdentity,0u,irr_glsl_identityFunction);
361360
}
362361
int irr_glsl_subgroupInclusiveAdd_impl(in bool clearScratchToIdentity, int value)
363362
{

include/irr/builtin/glsl/workgroup/ballot.glsl

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ If `GL_KHR_subgroup_arithmetic` is not available then these functions require em
5959

6060

6161
// puts the result into shared memory at offsets [0,_IRR_GLSL_WORKGROUP_SIZE_/32)
62-
void irr_glsl_workgroupBallot(in bool value)
62+
void irr_glsl_workgroupBallot_noBarriers(in bool value)
6363
{
6464
// TODO: Optimization using subgroupBallot in an ifdef IRR_GL_something (need to do feature mapping first)
6565
if (gl_LocalInvocationIndex<irr_glsl_workgroupBallot_impl_BitfieldDWORDs)
@@ -68,6 +68,12 @@ void irr_glsl_workgroupBallot(in bool value)
6868
memoryBarrierShared();
6969
if (value)
7070
atomicOr(_IRR_GLSL_SCRATCH_SHARED_DEFINED_[irr_glsl_workgroupBallot_impl_getDWORD(gl_LocalInvocationIndex)],1u<<(gl_LocalInvocationIndex&31u));
71+
}
72+
void irr_glsl_workgroupBallot(in bool value)
73+
{
74+
barrier();
75+
memoryBarrierShared();
76+
irr_glsl_workgroupBallot_noBarriers(value);
7177
barrier();
7278
}
7379

@@ -118,7 +124,7 @@ uint irr_glsl_workgroupBallotBitCount()
118124
}
119125

120126

121-
uint irr_glsl_workgroupBroadcast_noEndBarriers(in uint val, in uint id)
127+
uint irr_glsl_workgroupBroadcast_noBarriers(in uint val, in uint id)
122128
{
123129
if (gl_LocalInvocationIndex==id)
124130
_IRR_GLSL_SCRATCH_SHARED_DEFINED_[irr_glsl_workgroupBallot_impl_BitfieldDWORDs] = val;
@@ -128,13 +134,15 @@ uint irr_glsl_workgroupBroadcast_noEndBarriers(in uint val, in uint id)
128134
}
129135
uint irr_glsl_workgroupBroadcast(in uint val, in uint id)
130136
{
131-
const uint retval = irr_glsl_workgroupBroadcast_noEndBarriers(val,id);
137+
barrier();
138+
memoryBarrierShared();
139+
const uint retval = irr_glsl_workgroupBroadcast_noBarriers(val,id);
132140
barrier();
133141
memoryBarrierShared();
134142
return retval;
135143
}
136144

137-
uint irr_glsl_workgroupBroadcastFirst_noEndBarriers(in uint val)
145+
uint irr_glsl_workgroupBroadcastFirst_noBarriers(in uint val)
138146
{
139147
if (irr_glsl_workgroupElect())
140148
_IRR_GLSL_SCRATCH_SHARED_DEFINED_[irr_glsl_workgroupBallot_impl_BitfieldDWORDs] = val;
@@ -144,7 +152,9 @@ uint irr_glsl_workgroupBroadcastFirst_noEndBarriers(in uint val)
144152
}
145153
uint irr_glsl_workgroupBroadcastFirst(in uint val)
146154
{
147-
const uint retval = irr_glsl_workgroupBroadcastFirst_noEndBarriers(val);
155+
barrier();
156+
memoryBarrierShared();
157+
const uint retval = irr_glsl_workgroupBroadcastFirst_noBarriers(val);
148158
barrier();
149159
memoryBarrierShared();
150160
return retval;

0 commit comments

Comments
 (0)