Skip to content

Commit cf1a600

Browse files
committed
Optimize nlz() function
Add intrinsic alternatives if supported
1 parent c1a98a0 commit cf1a600

File tree

3 files changed

+172
-12
lines changed

3 files changed

+172
-12
lines changed

include/slimcpplib/long_math.h

Lines changed: 90 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -141,17 +141,27 @@ constexpr type_t half_hi(type_t value) noexcept;
141141
template<typename type_t, std::enable_if_t<is_unsigned_v<type_t>, int> = 0>
142142
constexpr type_t half_make_hi(type_t value) noexcept;
143143

144+
// propagate most significant bit to the right
145+
146+
template<typename type_t, uint_t byte_count = byte_count_v<type_t>, std::enable_if_t<is_unsigned_v<type_t>, int> = 0>
147+
constexpr type_t pmsbr(type_t value) noexcept;
148+
149+
// calculate number of set bits
150+
151+
template<typename type_t, std::enable_if_t<is_unsigned_v<type_t>, int> = 0>
152+
constexpr uint_t popcnt(type_t value) noexcept;
153+
144154
// calculate leading zero bits
145155

146156
template<typename type_t, std::enable_if_t<is_unsigned_v<type_t>, int> = 0>
147157
constexpr uint_t nlz(type_t value) noexcept;
148158

149-
// shift bits to left
159+
// shift bits to the left
150160

151161
template<typename type_t, std::enable_if_t<is_unsigned_v<type_t>, int> = 0>
152162
constexpr type_t shl2(type_t value_hi, type_t value_lo, uint_t shift) noexcept;
153163

154-
// shift bits to right
164+
// shift bits to the right
155165

156166
template<typename type_t, std::enable_if_t<is_unsigned_v<type_t>, int> = 0>
157167
constexpr type_t shr2(type_t value_hi, type_t value_lo, uint_t shift) noexcept;
@@ -230,19 +240,87 @@ constexpr type_t half_make_hi(type_t value) noexcept
230240

231241

232242
////////////////////////////////////////////////////////////////////////////////////////////////////
233-
template<typename type_t, std::enable_if_t<is_unsigned_v<type_t>, int>>
234-
constexpr uint_t nlz(type_t value) noexcept
243+
// pmsbr: "propagate most significant bit to the right".
// Returns value with every bit below its highest set bit also set (zero stays zero).
// One-byte case: the top bit is smeared down with shifts of 1, 2 and 4.
// Wider cases: recurse with byte_count / 2 first, then OR in a copy shifted right
// by 4 * byte_count bits, i.e. half of the width currently being handled.
// NOTE(review): this span is a rendered diff; code lines that follow a marker
// ending in '-' belong to the removed loop-based nlz() body, not to pmsbr.
template<typename type_t, uint_t byte_count, std::enable_if_t<is_unsigned_v<type_t>, int>>
244+
constexpr type_t pmsbr(type_t value) noexcept
235245
{
236-
uint_t result = 0;
237-
type_t mask = type_t(1) << (bit_count_v<type_t> - 1);
246+
if constexpr (byte_count == 1) {
238247

239-
while ((~value & mask) != 0) {
248+
value |= (value >> 1);
249+
value |= (value >> 2);
250+
value |= (value >> 4);
240251

241-
mask >>= 1;
242-
result++;
252+
} else {
253+
254+
value = pmsbr<type_t, byte_count / 2>(value);
255+
value |= (value >> (4 * byte_count));
243256
}
244257

245-
return result;
258+
return value;
259+
}
260+
261+
262+
263+
////////////////////////////////////////////////////////////////////////////////////////////////////
264+
// Builds the repeating bit pattern used by the parallel bit-count steps.
// Every bit_count-wide group of the result has its low bit_count / 2 bits set;
// mask_count is the number of groups generated (by default as many as fit in type_t).
template<typename type_t, uint_t bit_count = bit_count_v<type_t>, uint_t mask_count = bit_count_v<type_t> / bit_count, std::enable_if_t<is_unsigned_v<type_t>, int> = 0>
constexpr type_t popcnt_msk() noexcept
{
    if constexpr (mask_count != 1) {

        // take the pattern with one group fewer and append one more group above it
        constexpr type_t narrower = popcnt_msk<type_t, bit_count, mask_count - 1>();
        return narrower | (narrower << (bit_count));

    } else {

        // single group: bit (bit_count / 2 - 1) together with every bit below it
        constexpr type_t top_bit = type_t(1) << (bit_count / 2 - 1);
        return pmsbr<type_t>(top_bit);
    }
}
277+
278+
279+
280+
////////////////////////////////////////////////////////////////////////////////////////////////////
281+
// Core of the bit-count routine. The returned value carries the number of set
// bits in its low byte; higher bytes may still hold partial sums, so the caller
// is expected to mask the result.
// byte_count selects how many bytes of type_t are folded together.
template<typename type_t, uint_t byte_count = byte_count_v<type_t>, std::enable_if_t<is_unsigned_v<type_t>, int> = 0>
constexpr type_t popcnt_impl(type_t value) noexcept
{
    if constexpr (byte_count > 4) {

        // wide case: handle half the byte count first, then add the upper half onto the lower
        value = popcnt_impl<type_t, byte_count / 2>(value);
        value = value + (value >> (4 * byte_count));

    } else {

        constexpr type_t pair_mask = popcnt_msk<type_t, 2>();
        constexpr type_t nibble_mask = popcnt_msk<type_t, 4>();
        constexpr type_t byte_mask = popcnt_msk<type_t, 8>();

        // 2-bit sums, then 4-bit sums, then one sum per byte
        value = value - ((value >> 1) & pair_mask);
        value = (value & nibble_mask) + ((value >> 2) & nibble_mask);
        value = (value + (value >> 4)) & byte_mask;

        // accumulate the per-byte sums into the low byte
        if constexpr (byte_count >= 2)
            value = value + (value >> 8);
        if constexpr (byte_count >= 3)
            value = value + (value >> 16);
    }

    return value;
}
307+
308+
309+
310+
////////////////////////////////////////////////////////////////////////////////////////////////////
311+
template<typename type_t, std::enable_if_t<is_unsigned_v<type_t>, int>>
312+
constexpr uint_t popcnt(type_t value) noexcept
313+
{
314+
return popcnt_impl<type_t>(value) & ((bit_count_v<type_t> << 2) - 1);
315+
}
316+
317+
318+
319+
////////////////////////////////////////////////////////////////////////////////////////////////////
320+
template<typename type_t, std::enable_if_t<is_unsigned_v<type_t>, int>>
321+
constexpr uint_t nlz(type_t value) noexcept
322+
{
323+
return bit_count_v<type_t> - popcnt(pmsbr(value));
246324
}
247325

248326

@@ -470,7 +548,7 @@ constexpr type_t divr2(type_t value1_hi, type_t value1_lo, type_t value2, std::o
470548

471549
const type_t t1 = quotient_hi * nvalue2_lo;
472550
const type_t t2 = half_make_hi(*remainder_hi) | nvalue1_hi;
473-
551+
474552
if (t1 > t2) {
475553

476554
--quotient_hi;
@@ -483,7 +561,7 @@ constexpr type_t divr2(type_t value1_hi, type_t value1_lo, type_t value2, std::o
483561

484562
std::optional<type_t> remainder_lo = type_t();
485563
type_t quotient_lo = divr(nvalue1_21, nvalue2_hi, remainder_lo);
486-
564+
487565
const type_t t3 = quotient_lo * nvalue2_lo;
488566
const type_t t4 = half_make_hi(*remainder_lo) | nvalue1_lo;
489567

include/slimcpplib/long_math_gcc.h

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,13 @@ namespace slim
4444
// standalone routines
4545
////////////////////////////////////////////////////////////////////////////////////////////////////
4646

47+
// calculate number of set bits
48+
49+
uint_t popcnt(uint8_t value) noexcept;
50+
uint_t popcnt(uint16_t value) noexcept;
51+
uint_t popcnt(uint32_t value) noexcept;
52+
uint_t popcnt(uint64_t value) noexcept;
53+
4754
// calculate leading zero bits
4855

4956
uint_t nlz(uint8_t value) noexcept;
@@ -74,6 +81,38 @@ uint64_t mulc(uint64_t value1, uint64_t value2, uint64_t& carry) noexcept;
7481
// standalone routines
7582
////////////////////////////////////////////////////////////////////////////////////////////////////
7683

84+
// Number of set bits in an 8-bit value, computed with the GCC builtin.
inline uint_t popcnt(uint8_t value) noexcept
{
    // the builtin takes unsigned int; zero-extension adds no set bits
    const unsigned int widened = value;
    return __builtin_popcount(widened);
}
88+
89+
90+
91+
////////////////////////////////////////////////////////////////////////////////////////////////////
92+
// Number of set bits in a 16-bit value, computed with the GCC builtin.
inline uint_t popcnt(uint16_t value) noexcept
{
    // the builtin takes unsigned int; zero-extension adds no set bits
    const unsigned int widened = value;
    return __builtin_popcount(widened);
}
96+
97+
98+
99+
////////////////////////////////////////////////////////////////////////////////////////////////////
100+
// Number of set bits in a 32-bit value, computed with the GCC builtin.
inline uint_t popcnt(uint32_t value) noexcept
{
    // the builtin's parameter type is unsigned int
    const unsigned int operand = value;
    return __builtin_popcount(operand);
}
104+
105+
106+
107+
////////////////////////////////////////////////////////////////////////////////////////////////////
108+
// Number of set bits in a 64-bit value, computed with the GCC builtin.
inline uint_t popcnt(uint64_t value) noexcept
{
    // __builtin_popcount takes unsigned int, so passing a 64-bit value would
    // silently discard the upper 32 bits. The "ll" variant takes
    // unsigned long long and therefore counts the whole value.
    return __builtin_popcountll(value);
}
112+
113+
114+
115+
////////////////////////////////////////////////////////////////////////////////////////////////////
77116
inline uint_t nlz(uint8_t value) noexcept
78117
{
79118
return value ? __builtin_clz(value) : bit_count_v<uint8_t>;

include/slimcpplib/long_math_msvc.h

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,13 @@ namespace slim
4444
// standalone routines
4545
////////////////////////////////////////////////////////////////////////////////////////////////////
4646

47+
// calculate number of set bits
48+
49+
uint_t popcnt(uint8_t value) noexcept;
50+
uint_t popcnt(uint16_t value) noexcept;
51+
uint_t popcnt(uint32_t value) noexcept;
52+
uint_t popcnt(uint64_t value) noexcept;
53+
4754
// calculate leading zero bits
4855

4956
uint_t nlz(uint8_t value) noexcept;
@@ -86,6 +93,42 @@ uint64_t mulc(uint64_t value1, uint64_t value2, uint64_t& carry) noexcept;
8693
// standalone routines
8794
////////////////////////////////////////////////////////////////////////////////////////////////////
8895

96+
// Number of set bits in an 8-bit value, computed with the MSVC 16-bit intrinsic.
inline uint_t popcnt(uint8_t value) noexcept
{
    // widen to the intrinsic's operand type; zero-extension adds no set bits
    const unsigned short widened = value;
    return __popcnt16(widened);
}
100+
101+
102+
103+
////////////////////////////////////////////////////////////////////////////////////////////////////
104+
// Number of set bits in a 16-bit value, computed with the MSVC 16-bit intrinsic.
inline uint_t popcnt(uint16_t value) noexcept
{
    const unsigned short operand = value;
    return __popcnt16(operand);
}
108+
109+
110+
111+
////////////////////////////////////////////////////////////////////////////////////////////////////
112+
// Number of set bits in a 32-bit value, computed with the MSVC 32-bit intrinsic.
inline uint_t popcnt(uint32_t value) noexcept
{
    const unsigned int operand = value;
    return __popcnt(operand);
}
116+
117+
118+
119+
////////////////////////////////////////////////////////////////////////////////////////////////////
120+
// Number of set bits in a 64-bit value.
// Uses the 64-bit intrinsic when _M_X64 is defined; otherwise the value is
// split into two 32-bit halves that are counted separately and summed.
inline uint_t popcnt(uint64_t value) noexcept
{
#ifdef _M_X64
    return __popcnt64(value);
#else
    const uint32_t upper = static_cast<uint32_t>(half_hi(value));
    const uint32_t lower = static_cast<uint32_t>(half_lo(value));

    return __popcnt(upper) + __popcnt(lower);
#endif // _M_X64
}
128+
129+
130+
131+
////////////////////////////////////////////////////////////////////////////////////////////////////
89132
inline uint_t nlz(uint8_t value) noexcept
90133
{
91134
unsigned long result = 0;

0 commit comments

Comments
 (0)