@@ -77,39 +77,54 @@ struct exclusive_scan
77
77
*/
78
78
namespace impl
79
79
{
80
- template<uint16_t ItemCount , class BallotAccessor, class ArithmeticAccessor, template< class ,uint16_t> class op_t >
81
- uint32_t ballotPolyCount (NBL_REF_ARG (BallotAccessor) ballotAccessor, NBL_REF_ARG (ArithmeticAccessor) arithmeticAccessor, NBL_REF_ARG (uint32_t) localBitfield )
80
+ template<uint16_t DWORDCount , class BallotAccessor>
81
+ uint16_t ballotCountedBitDWORD (NBL_REF_ARG (BallotAccessor) ballotAccessor)
82
82
{
83
- localBitfield = 0u;
84
- if (SubgroupContiguousIndex ()<impl::BallotDWORDCount (ItemCount))
85
- localBitfield = ballotAccessor.get (SubgroupContiguousIndex ());
86
- return op_t<plus<uint32_t>,impl::ballot_dword_count<ItemCount>::value>::template __call<ArithmeticAccessor>(countbits (localBitfield),arithmeticAccessor);
83
+ const uint32_t index = SubgroupContiguousIndex ();
84
+ if (index<DWORDCount)
85
+ {
86
+ const uint32_t bitfield = ballotAccessor.get (index);
87
+ // FIXME: stip unused bits from bitfield
88
+ return uint16_t (countbits (bitfield));
89
+ }
90
+ return 0 ;
91
+ }
92
+
93
+ template<bool Exclusive, uint16_t ItemCount, class BallotAccessor, class ArithmeticAccessor>
94
+ uint16_t ballotScanBitCount (NBL_REF_ARG (BallotAccessor) ballotAccessor, NBL_REF_ARG (ArithmeticAccessor) arithmeticAccessor)
95
+ {
96
+ const uint32_t localBitfield = ballotAccessor.get (impl::getDWORD (SubgroupContiguousIndex ()));
97
+
98
+ static const uint16_t DWORDCount = impl::ballot_dword_count<ItemCount>::value;
99
+ const uint32_t count = exclusive_scan<plus<uint32_t>,DWORDCount>::template __call<ArithmeticAccessor>(
100
+ ballotCountedBitDWORD<DWORDCount,BallotAccessor>(ballotAccessor),
101
+ arithmeticAccessor
102
+ );
103
+ return uint16_t (countbits (localBitfield&(Exclusive ? glsl::gl_SubgroupLtMask ():glsl::gl_SubgroupLeMask ())[0 ]));
104
+ // return uint16_t(countbits(localBitfield&(Exclusive ? glsl::gl_SubgroupLtMask():glsl::gl_SubgroupLeMask())[0])+count);
87
105
}
88
106
}
89
107
90
108
template<uint16_t ItemCount, class BallotAccessor, class ArithmeticAccessor>
91
109
uint16_t ballotBitCount (NBL_REF_ARG (BallotAccessor) ballotAccessor, NBL_REF_ARG (ArithmeticAccessor) arithmeticAccessor)
92
110
{
93
- uint32_t dummy;
94
- return uint16_t (impl::ballotPolyCount<ItemCount,BallotAccessor,ArithmeticAccessor,reduction>(ballotAccessor,arithmeticAccessor,dummy));
111
+ static const uint16_t DWORDCount = impl::ballot_dword_count<ItemCount>::value;
112
+ return uint16_t (reduction<plus<uint32_t>,DWORDCount>::template __call<ArithmeticAccessor>(
113
+ impl::ballotCountedBitDWORD<DWORDCount,BallotAccessor>(ballotAccessor),
114
+ arithmeticAccessor
115
+ ));
95
116
}
96
117
97
118
template<uint16_t ItemCount, class BallotAccessor, class ArithmeticAccessor>
98
119
uint16_t ballotInclusiveBitCount (NBL_REF_ARG (BallotAccessor) ballotAccessor, NBL_REF_ARG (ArithmeticAccessor) arithmeticAccessor)
99
120
{
100
- uint32_t localBitfield;
101
- uint32_t count = impl::ballotPolyCount<ItemCount,BallotAccessor,ArithmeticAccessor,exclusive_scan>(ballotAccessor,arithmeticAccessor,localBitfield);
102
- // only using part of the mask is on purpose, I'm only interested in LSB
103
- return uint16_t (countbits (glsl::gl_SubgroupLeMask ()[0 ]&localBitfield)+count);
121
+ return impl::ballotScanBitCount<false ,ItemCount,BallotAccessor,ArithmeticAccessor>(ballotAccessor,arithmeticAccessor);
104
122
}
105
123
106
124
template<uint16_t ItemCount, class BallotAccessor, class ArithmeticAccessor>
107
125
uint16_t ballotExclusiveBitCount (NBL_REF_ARG (BallotAccessor) ballotAccessor, NBL_REF_ARG (ArithmeticAccessor) arithmeticAccessor)
108
126
{
109
- uint32_t localBitfield;
110
- uint32_t count = impl::ballotPolyCount<ItemCount,BallotAccessor,ArithmeticAccessor,exclusive_scan>(ballotAccessor,arithmeticAccessor,localBitfield);
111
- // only using part of the mask is on purpose, I'm only interested in LSB
112
- return uint16_t (countbits (glsl::gl_SubgroupLtMask ()[0 ]&localBitfield)+count);
127
+ return impl::ballotScanBitCount<true ,ItemCount,BallotAccessor,ArithmeticAccessor>(ballotAccessor,arithmeticAccessor);
113
128
}
114
129
115
130
}
0 commit comments