-
Notifications
You must be signed in to change notification settings - Fork 14.9k
Open
Description
This is more of a feature request than a bug.
This is a follow-up of #158741, about an 8-bit popcount operation.
#include <stdint.h>
#if defined(__ARM_NEON)
#include <arm_neon.h>
// My version of 8-bit popcount, which makes smaller code than the built-in.
unsigned int popcount_8(uint8_t x) {
// Initialize the vector register. Set all lanes at once so that the
// compiler will not emit instruction to zero-initialize other lanes.
uint8x8_t v = vdup_n_u8(x);
// Count the number of set bits for each lane (8-bit) in the vector.
v = vcnt_u8(v);
// Get lane 0 and discard lanes 1 to 7. (Return type was uint8_t)
return vget_lane_u8(v, 0);
}
#endif
// Baseline for comparison: 8-bit population count using the compiler's
// built-in. x is an 8-bit value, so at most its low 8 bits can be set and
// the result is always in the range 0..8.
unsigned int popcount_8_b(uint8_t x) {
    return (unsigned int)__builtin_popcount(x);
}
// With an AArch64 target, you can see the difference in the compiled code:
(clang trunk-20250918)
popcount_8:
fmov s0, w0
cnt v0.8b, v0.8b
umov w0, v0.b[0]
ret
popcount_8_b:
and w8, w0, #0xff
fmov s0, w8
cnt v0.8b, v0.8b
fmov w0, s0
ret
(gcc 15.2.0)
popcount_8:
dup v31.8b, w0
cnt v31.8b, v31.8b
umov w0, v31.b[0]
ret
Compared with the solution currently implemented in #158741, this saves even the bitwise AND operation. I think this is the smallest possible code for an 8-bit popcount on AArch64 and ARMv7+NEON.
Update: I also reported the issue to GCC.