-
Notifications
You must be signed in to change notification settings - Fork 15.2k
Closed
Description
Zig code:
export fn foo(w: u64, x: u64, y: u64, z: u64) u64 {
var s = w;
s &= ~(x & y);
s &= ~(z & ~y);
return s;
}
export fn fooVec(w: @Vector(16, u8), x: @Vector(16, u8), y: @Vector(16, u8), z: @Vector(16, u8)) @Vector(16, u8) {
var s = w;
s &= ~(x & y);
s &= ~(z & ~y);
return s;
}Compiled for Znver3:
foo:
not rcx
and rsi, rdx
andn rax, rsi, rdi
or rcx, rdx
and rax, rcx
ret
fooVec:
vpand xmm1, xmm2, xmm1
vpcmpeqd xmm4, xmm4, xmm4
vpandn xmm0, xmm1, xmm0
vpxor xmm1, xmm3, xmm4
vpor xmm1, xmm1, xmm2
vpand xmm0, xmm0, xmm1
retI expected it to be and+andn+andn+andn for the scalar version, and vpand+vpandn+vpandn+vpandn for the vector version. However, it turns s &= ~(z & ~y); into s &= ~z | y;. While this makes sense on machines without andn (and doesn't matter for machines which have andn and orn, or just vpternlogq), with x86-64 BMI1 we only get andn. We never got an orn, so this optimization hurts us.
LLVM (optimized):
define dso_local i64 @foo(i64 %0, i64 %1, i64 %2, i64 %3) local_unnamed_addr {
Entry:
%4 = and i64 %2, %1
%5 = xor i64 %4, -1
%6 = and i64 %5, %0
%.not = xor i64 %3, -1
%7 = or i64 %.not, %2
%8 = and i64 %6, %7
ret i64 %8
}
declare void @llvm.dbg.value(metadata, metadata, metadata) #1
define dso_local <16 x i8> @fooVec(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2, <16 x i8> %3) local_unnamed_addr {
Entry:
%4 = and <16 x i8> %2, %1
%5 = xor <16 x i8> %4, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
%6 = and <16 x i8> %5, %0
%.not = xor <16 x i8> %3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
%7 = or <16 x i8> %.not, %2
%8 = and <16 x i8> %6, %7
ret <16 x i8> %8
}