-
Notifications
You must be signed in to change notification settings - Fork 15.3k
Open
Labels
Description
This code:
export fn extract_bits1(a: u64) u64 {
return ((a >> 2) & 0b1) | ((a >> 5) & 0b110);
}
This LLVM IR:
define dso_local i64 @extract_bits1(i64 %0) #0 {
1:
%2 = zext i6 2 to i64
%3 = lshr i64 %0, %2
%4 = and i64 %3, 1
%5 = zext i6 5 to i64
%6 = lshr i64 %0, %5
%7 = and i64 %6, 6
%8 = or i64 %4, %7
ret i64 %8
}
Gets "optimized" to this LLVM IR when compiled with -mcpu=pwr10:
define dso_local i64 @extract_bits1(i64 %0) local_unnamed_addr {
Entry:
%1 = insertelement <2 x i64> poison, i64 %0, i64 0
%2 = shufflevector <2 x i64> %1, <2 x i64> poison, <2 x i32> zeroinitializer
%3 = lshr <2 x i64> %2, <i64 2, i64 5>
%4 = and <2 x i64> %3, <i64 1, i64 6>
%shift = shufflevector <2 x i64> %4, <2 x i64> poison, <2 x i32> <i32 1, i32 poison>
%5 = or <2 x i64> %4, %shift
%6 = extractelement <2 x i64> %5, i64 0
ret i64 %6
}
For pwr9 and other architecture/CPU combinations, this does not happen.
This results in this assembly:
.LCPI0_0:
.long 0
.long 2
.long 0
.long 5
.LCPI0_1:
.long 0
.long 1
.long 0
.long 6
extract_bits1:
stwu 1, -64(1)
stw 4, 32(1)
stw 3, 16(1)
xxsplti32dx 34, 0, 66051
li 3, .LCPI0_0@l
lis 4, .LCPI0_0@ha
lxv 35, 32(1)
lxv 36, 16(1)
xxsplti32dx 34, 1, 269554195
vperm 2, 4, 3, 2
lxvx 35, 4, 3
li 3, .LCPI0_1@l
lis 4, .LCPI0_1@ha
lxvx 0, 4, 3
vsrd 2, 2, 3
xxland 1, 34, 0
xxswapd 35, 1
xxeval 0, 35, 34, 0, 31
stxv 0, 48(1)
lwz 3, 48(1)
lwz 4, 52(1)
addi 1, 1, 64
blr
Compare that to the pwr9 assembly, which at first glance seems a lot better:
extract_bits1:
stwu 1, -16(1)
rlwinm 5, 4, 30, 31, 31
li 3, 0
rlwimi 5, 4, 27, 29, 30
mr 4, 5
addi 1, 1, 16
blr
Expected Behavior
I expect the optimized LLVM IR to be the same as it is for the pwr9 platform.
define dso_local i64 @extract_bits1(i64 %0) local_unnamed_addr {
Entry:
%1 = lshr i64 %0, 2
%2 = and i64 %1, 1
%3 = lshr i64 %0, 5
%4 = and i64 %3, 6
%5 = or i64 %2, %4
ret i64 %5
}
Originally reported as ziglang/zig#18381