-
Notifications
You must be signed in to change notification settings - Fork 15.2k
Labels
llvm:instcombineCovers the InstCombine, InstSimplify and AggressiveInstCombine passesCovers the InstCombine, InstSimplify and AggressiveInstCombine passesmissed-optimization
Description
Here is a simple loop to find the index of the first different bit in two arrays:
unsigned long crit_bit(unsigned long *a, unsigned long *b, unsigned long size) {
unsigned long ret = 0;
#pragma nounroll
for (unsigned long i = 0; i < size; ++i) {
unsigned long diff = a[i] ^ b[i];
int bits = 8 * sizeof(unsigned long);
ret += __builtin_ctzg(diff, bits);
if (diff) {
break;
}
}
return ret;
}On x86-64 with -O3 -mbmi, the loop generates
.LBB0_4:
mov r9, rax
mov r10, qword ptr [rdi + 8*r8]
mov r11, qword ptr [rsi + 8*r8]
mov rax, r11
xor rax, r10
tzcnt rax, rax
mov rbx, r11
xor rbx, r10
cmove rax, rcx
add rax, r9
xor r11, r10
jne .LBB0_6
lea r9, [r8 + 1]
cmp rdx, r8
mov r8, r9
jne .LBB0_4First, it's using cmove to get 64 in case the input was zero, but tzcnt already returns 64 in that case so that's redundant.
Second, it could use the flags after tzcnt to handle the if (diff) break, but instead it re-does the comparison.
Better code would look something like this:
xor rax, r10
tzcnt rax, rax
lea rax, [rax + r9]
jc .LBB0_6Metadata
Metadata
Assignees
Labels
llvm:instcombineCovers the InstCombine, InstSimplify and AggressiveInstCombine passesCovers the InstCombine, InstSimplify and AggressiveInstCombine passesmissed-optimization