[MachineLICM] Let targets decide if copy instructions are cheap #146599
@llvm/pr-subscribers-backend-x86 @llvm/pr-subscribers-backend-aarch64
Author: Guy David (guy-david)
Changes: When checking whether it is profitable to hoist an instruction, the pass may override a target's ruling because it assumes that all COPY instructions are cheap, and that may not be the case for all micro-architectures (especially when copying between different register classes). On AArch64 there is a 0% difference in performance in LLVM's test-suite with this change. Additionally, very few tests were affected, which suggests the special case is not worth keeping.
Patch is 41.31 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/146599.diff
9 Files Affected:
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index c9079170ca575..70a178f642fb0 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -1219,7 +1219,7 @@ bool MachineLICMImpl::HasHighOperandLatency(MachineInstr &MI, unsigned DefIdx,
/// Return true if the instruction is marked "cheap" or the operand latency
/// between its def and a use is one or less.
bool MachineLICMImpl::IsCheapInstruction(MachineInstr &MI) const {
- if (TII->isAsCheapAsAMove(MI) || MI.isCopyLike())
+ if (TII->isAsCheapAsAMove(MI))
return true;
bool isCheap = false;
diff --git a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
index 53126a08db86f..2bd04ac30509e 100644
--- a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
+++ b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
@@ -18,14 +18,28 @@ define fastcc i8 @allocno_reload_assign(ptr %p) {
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: mvn w8, w8
+; CHECK-NEXT: uunpklo z1.h, z0.b
+; CHECK-NEXT: uunpkhi z2.h, z0.b
; CHECK-NEXT: sbfx x8, x8, #0, #1
; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: uunpklo z3.s, z1.h
+; CHECK-NEXT: uunpkhi z4.s, z1.h
+; CHECK-NEXT: uunpklo z6.s, z2.h
+; CHECK-NEXT: uunpkhi z16.s, z2.h
; CHECK-NEXT: punpklo p1.h, p0.b
; CHECK-NEXT: punpkhi p0.h, p0.b
; CHECK-NEXT: punpklo p2.h, p1.b
; CHECK-NEXT: punpkhi p4.h, p1.b
+; CHECK-NEXT: uunpklo z1.d, z3.s
+; CHECK-NEXT: uunpkhi z2.d, z3.s
; CHECK-NEXT: punpklo p6.h, p0.b
+; CHECK-NEXT: uunpklo z3.d, z4.s
+; CHECK-NEXT: uunpkhi z4.d, z4.s
; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: uunpklo z5.d, z6.s
+; CHECK-NEXT: uunpkhi z6.d, z6.s
+; CHECK-NEXT: uunpklo z7.d, z16.s
+; CHECK-NEXT: uunpkhi z16.d, z16.s
; CHECK-NEXT: punpklo p1.h, p2.b
; CHECK-NEXT: punpkhi p2.h, p2.b
; CHECK-NEXT: punpklo p3.h, p4.b
@@ -35,28 +49,14 @@ define fastcc i8 @allocno_reload_assign(ptr %p) {
; CHECK-NEXT: punpklo p7.h, p0.b
; CHECK-NEXT: punpkhi p0.h, p0.b
; CHECK-NEXT: .LBB0_1: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: uunpklo z1.h, z0.b
-; CHECK-NEXT: uunpklo z2.s, z1.h
-; CHECK-NEXT: uunpkhi z1.s, z1.h
-; CHECK-NEXT: uunpklo z3.d, z2.s
-; CHECK-NEXT: uunpkhi z2.d, z2.s
-; CHECK-NEXT: st1b { z3.d }, p1, [z0.d]
+; CHECK-NEXT: st1b { z1.d }, p1, [z0.d]
; CHECK-NEXT: st1b { z2.d }, p2, [z0.d]
-; CHECK-NEXT: uunpklo z2.d, z1.s
-; CHECK-NEXT: uunpkhi z1.d, z1.s
-; CHECK-NEXT: st1b { z2.d }, p3, [z0.d]
-; CHECK-NEXT: uunpkhi z2.h, z0.b
-; CHECK-NEXT: uunpklo z3.s, z2.h
-; CHECK-NEXT: uunpkhi z2.s, z2.h
-; CHECK-NEXT: st1b { z1.d }, p4, [z0.d]
-; CHECK-NEXT: uunpklo z1.d, z3.s
-; CHECK-NEXT: st1b { z1.d }, p5, [z0.d]
-; CHECK-NEXT: uunpkhi z1.d, z3.s
-; CHECK-NEXT: st1b { z1.d }, p6, [z0.d]
-; CHECK-NEXT: uunpklo z1.d, z2.s
-; CHECK-NEXT: st1b { z1.d }, p7, [z0.d]
-; CHECK-NEXT: uunpkhi z1.d, z2.s
-; CHECK-NEXT: st1b { z1.d }, p0, [z0.d]
+; CHECK-NEXT: st1b { z3.d }, p3, [z0.d]
+; CHECK-NEXT: st1b { z4.d }, p4, [z0.d]
+; CHECK-NEXT: st1b { z5.d }, p5, [z0.d]
+; CHECK-NEXT: st1b { z6.d }, p6, [z0.d]
+; CHECK-NEXT: st1b { z7.d }, p7, [z0.d]
+; CHECK-NEXT: st1b { z16.d }, p0, [z0.d]
; CHECK-NEXT: str p8, [x0]
; CHECK-NEXT: b .LBB0_1
br label %1
diff --git a/llvm/test/CodeGen/PowerPC/vsx-fma-m-early.ll b/llvm/test/CodeGen/PowerPC/vsx-fma-m-early.ll
index 9cb2d4444b974..6cfb8b0e73f7c 100644
--- a/llvm/test/CodeGen/PowerPC/vsx-fma-m-early.ll
+++ b/llvm/test/CodeGen/PowerPC/vsx-fma-m-early.ll
@@ -1,17 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
;; Tests that the ppc-vsx-fma-mutate pass with the schedule-ppc-vsx-fma-mutation-early pass does not hoist xxspltiw out of loops.
; RUN: llc -verify-machineinstrs -mcpu=pwr10 -disable-ppc-vsx-fma-mutation=false \
; RUN: -ppc-asm-full-reg-names -schedule-ppc-vsx-fma-mutation-early \
-; RUN: -mtriple powerpc64-ibm-aix < %s | FileCheck --check-prefixes=CHECK64,AIX64 %s
+; RUN: -mtriple powerpc64-ibm-aix < %s | FileCheck --check-prefixes=AIX64 %s
; RUN: llc -verify-machineinstrs -mcpu=pwr10 -disable-ppc-vsx-fma-mutation=false \
; RUN: -ppc-asm-full-reg-names -schedule-ppc-vsx-fma-mutation-early \
-; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck --check-prefixes=CHECK64,LINUX64 %s
+; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck --check-prefixes=LINUX64 %s
; RUN: llc -verify-machineinstrs -mcpu=pwr10 -disable-ppc-vsx-fma-mutation=false \
; RUN: -ppc-asm-full-reg-names -schedule-ppc-vsx-fma-mutation-early \
; RUN: -mtriple powerpc-ibm-aix < %s | FileCheck --check-prefix=CHECK32 %s
define void @bar(ptr noalias nocapture noundef writeonly %__output_a, ptr noalias nocapture noundef readonly %var1321In_a, ptr noalias nocapture noundef readonly %n) {
+; AIX64-LABEL: bar:
+; AIX64: # %bb.0: # %entry
+; AIX64-NEXT: lwz r5, 0(r5)
+; AIX64-NEXT: cmpwi r5, 1
+; AIX64-NEXT: bltlr cr0
+; AIX64-NEXT: # %bb.1: # %for.body.preheader
+; AIX64-NEXT: xxspltiw vs0, 1069066811
+; AIX64-NEXT: xxspltiw vs1, 1170469888
+; AIX64-NEXT: mtctr r5
+; AIX64-NEXT: li r5, 0
+; AIX64-NEXT: .align 5
+; AIX64-NEXT: L..BB0_2: # %for.body
+; AIX64-NEXT: #
+; AIX64-NEXT: lxvx vs2, r4, r5
+; AIX64-NEXT: xvmaddmsp vs2, vs0, vs1
+; AIX64-NEXT: stxvx vs2, r3, r5
+; AIX64-NEXT: addi r5, r5, 16
+; AIX64-NEXT: bdnz L..BB0_2
+; AIX64-NEXT: # %bb.3: # %for.end
+; AIX64-NEXT: blr
+;
+; LINUX64-LABEL: bar:
+; LINUX64: # %bb.0: # %entry
+; LINUX64-NEXT: lwz r5, 0(r5)
+; LINUX64-NEXT: cmpwi r5, 1
+; LINUX64-NEXT: bltlr cr0
+; LINUX64-NEXT: # %bb.1: # %for.body.preheader
+; LINUX64-NEXT: xxspltiw vs0, 1069066811
+; LINUX64-NEXT: xxspltiw vs1, 1170469888
+; LINUX64-NEXT: mtctr r5
+; LINUX64-NEXT: li r5, 0
+; LINUX64-NEXT: .p2align 5
+; LINUX64-NEXT: .LBB0_2: # %for.body
+; LINUX64-NEXT: #
+; LINUX64-NEXT: lxvx vs2, r4, r5
+; LINUX64-NEXT: xvmaddmsp vs2, vs0, vs1
+; LINUX64-NEXT: stxvx vs2, r3, r5
+; LINUX64-NEXT: addi r5, r5, 16
+; LINUX64-NEXT: bdnz .LBB0_2
+; LINUX64-NEXT: # %bb.3: # %for.end
+; LINUX64-NEXT: blr
+;
+; CHECK32-LABEL: bar:
+; CHECK32: # %bb.0: # %entry
+; CHECK32-NEXT: lwz r5, 0(r5)
+; CHECK32-NEXT: cmpwi r5, 0
+; CHECK32-NEXT: blelr cr0
+; CHECK32-NEXT: # %bb.1: # %for.body.preheader
+; CHECK32-NEXT: xxspltiw vs0, 1069066811
+; CHECK32-NEXT: xxspltiw vs1, 1170469888
+; CHECK32-NEXT: li r6, 0
+; CHECK32-NEXT: li r7, 0
+; CHECK32-NEXT: .align 4
+; CHECK32-NEXT: L..BB0_2: # %for.body
+; CHECK32-NEXT: #
+; CHECK32-NEXT: slwi r8, r7, 4
+; CHECK32-NEXT: addic r7, r7, 1
+; CHECK32-NEXT: addze r6, r6
+; CHECK32-NEXT: lxvx vs2, r4, r8
+; CHECK32-NEXT: xvmaddmsp vs2, vs0, vs1
+; CHECK32-NEXT: stxvx vs2, r3, r8
+; CHECK32-NEXT: xor r8, r7, r5
+; CHECK32-NEXT: or. r8, r8, r6
+; CHECK32-NEXT: bne cr0, L..BB0_2
+; CHECK32-NEXT: # %bb.3: # %for.end
+; CHECK32-NEXT: blr
entry:
%0 = load i32, ptr %n, align 4
%cmp11 = icmp sgt i32 %0, 0
@@ -28,7 +95,7 @@ for.body:
%add.ptr.val = load <4 x float>, ptr %add.ptr, align 1
%2 = tail call contract <4 x float> @llvm.fma.v4f32(<4 x float> %add.ptr.val, <4 x float> <float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000>, <4 x float> <float 6.270500e+03, float 6.270500e+03, float 6.270500e+03, float 6.270500e+03>)
%add.ptr6 = getelementptr inbounds float, ptr %__output_a, i64 %1
- store <4 x float> %2, ptr %add.ptr6, align 1
+ store <4 x float> %2, ptr %add.ptr6, align 1
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond.not, label %for.end, label %for.body
@@ -38,6 +105,74 @@ for.end:
}
define void @foo(i1 %cmp97) #0 {
+; AIX64-LABEL: foo:
+; AIX64: # %bb.0: # %entry
+; AIX64-NEXT: andi. r3, r3, 1
+; AIX64-NEXT: bclr 4, gt, 0
+; AIX64-NEXT: # %bb.1: # %for.body.preheader
+; AIX64-NEXT: xxlxor f0, f0, f0
+; AIX64-NEXT: xxlxor f2, f2, f2
+; AIX64-NEXT: xxmrghd vs1, vs0, vs0
+; AIX64-NEXT: xvcvdpsp vs34, vs1
+; AIX64-NEXT: xxlxor vs1, vs1, vs1
+; AIX64-NEXT: .align 4
+; AIX64-NEXT: L..BB1_2: # %for.body
+; AIX64-NEXT: #
+; AIX64-NEXT: xxmrghd vs2, vs2, vs0
+; AIX64-NEXT: xvcvdpsp vs35, vs2
+; AIX64-NEXT: xxspltiw vs2, 1170469888
+; AIX64-NEXT: vmrgew v3, v3, v2
+; AIX64-NEXT: xvcmpgtsp vs3, vs1, vs35
+; AIX64-NEXT: xvmaddasp vs2, vs35, vs1
+; AIX64-NEXT: xxland vs2, vs3, vs2
+; AIX64-NEXT: xscvspdpn f2, vs2
+; AIX64-NEXT: b L..BB1_2
+;
+; LINUX64-LABEL: foo:
+; LINUX64: # %bb.0: # %entry
+; LINUX64-NEXT: andi. r3, r3, 1
+; LINUX64-NEXT: bclr 4, gt, 0
+; LINUX64-NEXT: # %bb.1: # %for.body.preheader
+; LINUX64-NEXT: xxlxor f0, f0, f0
+; LINUX64-NEXT: xxlxor f2, f2, f2
+; LINUX64-NEXT: xxspltd vs1, vs0, 0
+; LINUX64-NEXT: xvcvdpsp vs34, vs1
+; LINUX64-NEXT: xxlxor vs1, vs1, vs1
+; LINUX64-NEXT: .p2align 4
+; LINUX64-NEXT: .LBB1_2: # %for.body
+; LINUX64-NEXT: #
+; LINUX64-NEXT: xxmrghd vs2, vs0, vs2
+; LINUX64-NEXT: xvcvdpsp vs35, vs2
+; LINUX64-NEXT: xxspltiw vs2, 1170469888
+; LINUX64-NEXT: vmrgew v3, v2, v3
+; LINUX64-NEXT: xvcmpgtsp vs3, vs1, vs35
+; LINUX64-NEXT: xvmaddasp vs2, vs35, vs1
+; LINUX64-NEXT: xxland vs2, vs3, vs2
+; LINUX64-NEXT: xxsldwi vs2, vs2, vs2, 3
+; LINUX64-NEXT: xscvspdpn f2, vs2
+; LINUX64-NEXT: b .LBB1_2
+;
+; CHECK32-LABEL: foo:
+; CHECK32: # %bb.0: # %entry
+; CHECK32-NEXT: andi. r3, r3, 1
+; CHECK32-NEXT: bclr 4, gt, 0
+; CHECK32-NEXT: # %bb.1: # %for.body.preheader
+; CHECK32-NEXT: lwz r3, L..C0(r2) # %const.0
+; CHECK32-NEXT: xxlxor f1, f1, f1
+; CHECK32-NEXT: xxlxor vs0, vs0, vs0
+; CHECK32-NEXT: xscvdpspn vs35, f1
+; CHECK32-NEXT: lxv vs34, 0(r3)
+; CHECK32-NEXT: .align 4
+; CHECK32-NEXT: L..BB1_2: # %for.body
+; CHECK32-NEXT: #
+; CHECK32-NEXT: xscvdpspn vs36, f1
+; CHECK32-NEXT: xxspltiw vs1, 1170469888
+; CHECK32-NEXT: vperm v4, v4, v3, v2
+; CHECK32-NEXT: xvcmpgtsp vs2, vs0, vs36
+; CHECK32-NEXT: xvmaddasp vs1, vs36, vs0
+; CHECK32-NEXT: xxland vs1, vs2, vs1
+; CHECK32-NEXT: xscvspdpn f1, vs1
+; CHECK32-NEXT: b L..BB1_2
entry:
br i1 %cmp97, label %for.body, label %for.end
@@ -57,122 +192,7 @@ for.end: ; preds = %entry
}
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
declare <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float>, <4 x float>)
-
-; CHECK64: bar:
-; CHECK64: # %bb.0: # %entry
-; CHECK64-NEXT: lwz r5, 0(r5)
-; CHECK64-NEXT: cmpwi r5, 1
-; CHECK64-NEXT: bltlr cr0
-; CHECK64-NEXT: # %bb.1: # %for.body.preheader
-; CHECK64-NEXT: xxspltiw vs0, 1069066811
-; CHECK64-NEXT: xxspltiw vs1, 1170469888
-; CHECK64-NEXT: mtctr r5
-; CHECK64-NEXT: li r5, 0
-; CHECK64-NEXT: {{.*}}align 5
-; CHECK64-NEXT: [[L2_bar:.*]]: # %for.body
-; CHECK64-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK64-NEXT: lxvx vs2, r4, r5
-; CHECK64-NEXT: xvmaddmsp vs2, vs0, vs1
-; CHECK64-NEXT: stxvx vs2, r3, r5
-; CHECK64-NEXT: addi r5, r5, 16
-; CHECK64-NEXT: bdnz [[L2_bar]]
-; CHECK64-NEXT: # %bb.3: # %for.end
-; CHECK64-NEXT: blr
-
-; AIX64: .foo:
-; AIX64-NEXT: # %bb.0: # %entry
-; AIX64-NEXT: andi. r3, r3, 1
-; AIX64-NEXT: bclr 4, gt, 0
-; AIX64-NEXT: # %bb.1: # %for.body.preheader
-; AIX64-NEXT: xxlxor f0, f0, f0
-; AIX64-NEXT: xxlxor vs1, vs1, vs1
-; AIX64-NEXT: xxlxor f2, f2, f2
-; AIX64-NEXT: .align 4
-; AIX64-NEXT: L..BB1_2: # %for.body
-; AIX64-NEXT: # =>This Inner Loop Header: Depth=1
-; AIX64-NEXT: xxmrghd vs2, vs2, vs0
-; AIX64-NEXT: xvcvdpsp vs34, vs2
-; AIX64-NEXT: xxmrghd vs2, vs0, vs0
-; AIX64-NEXT: xvcvdpsp vs35, vs2
-; AIX64-NEXT: xxspltiw vs2, 1170469888
-; AIX64-NEXT: vmrgew v2, v2, v3
-; AIX64-NEXT: xvcmpgtsp vs3, vs1, vs34
-; AIX64-NEXT: xvmaddasp vs2, vs34, vs1
-; AIX64-NEXT: xxland vs2, vs3, vs2
-; AIX64-NEXT: xscvspdpn f2, vs2
-; AIX64-NEXT: b L..BB1_2
-
-; LINUX64: foo: # @foo
-; LINUX64-NEXT: .Lfunc_begin1:
-; LINUX64-NEXT: .cfi_startproc
-; LINUX64-NEXT: # %bb.0: # %entry
-; LINUX64-NEXT: andi. r3, r3, 1
-; LINUX64-NEXT: bclr 4, gt, 0
-; LINUX64-NEXT: # %bb.1: # %for.body.preheader
-; LINUX64-NEXT: xxlxor f0, f0, f0
-; LINUX64-NEXT: xxlxor vs1, vs1, vs1
-; LINUX64-NEXT: xxlxor f2, f2, f2
-; LINUX64-NEXT: .p2align 4
-; LINUX64-NEXT: .LBB1_2: # %for.body
-; LINUX64-NEXT: # =>This Inner Loop Header: Depth=1
-; LINUX64-NEXT: xxmrghd vs2, vs0, vs2
-; LINUX64-NEXT: xvcvdpsp vs34, vs2
-; LINUX64-NEXT: xxspltd vs2, vs0, 0
-; LINUX64-NEXT: xvcvdpsp vs35, vs2
-; LINUX64-NEXT: xxspltiw vs2, 1170469888
-; LINUX64-NEXT: vmrgew v2, v3, v2
-; LINUX64-NEXT: xvcmpgtsp vs3, vs1, vs34
-; LINUX64-NEXT: xvmaddasp vs2, vs34, vs1
-; LINUX64-NEXT: xxland vs2, vs3, vs2
-; LINUX64-NEXT: xxsldwi vs2, vs2, vs2, 3
-; LINUX64-NEXT: xscvspdpn f2, vs2
-; LINUX64-NEXT: b .LBB1_2
-
-; CHECK32: .bar:
-; CHECK32-NEXT: # %bb.0: # %entry
-; CHECK32-NEXT: lwz r5, 0(r5)
-; CHECK32-NEXT: cmpwi r5, 0
-; CHECK32-NEXT: blelr cr0
-; CHECK32-NEXT: # %bb.1: # %for.body.preheader
-; CHECK32-NEXT: xxspltiw vs0, 1069066811
-; CHECK32-NEXT: xxspltiw vs1, 1170469888
-; CHECK32-NEXT: li r6, 0
-; CHECK32-NEXT: li r7, 0
-; CHECK32-NEXT: .align 4
-; CHECK32-NEXT: [[L2_foo:.*]]: # %for.body
-; CHECK32-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK32-NEXT: slwi r8, r7, 4
-; CHECK32-NEXT: addic r7, r7, 1
-; CHECK32-NEXT: addze r6, r6
-; CHECK32-NEXT: lxvx vs2, r4, r8
-; CHECK32-NEXT: xvmaddmsp vs2, vs0, vs1
-; CHECK32-NEXT: stxvx vs2, r3, r8
-; CHECK32-NEXT: xor r8, r7, r5
-; CHECK32-NEXT: or. r8, r8, r6
-; CHECK32-NEXT: bne cr0, [[L2_foo]]
-
-; CHECK32: .foo:
-; CHECK32-NEXT: # %bb.0: # %entry
-; CHECK32-NEXT: andi. r3, r3, 1
-; CHECK32-NEXT: bclr 4, gt, 0
-; CHECK32-NEXT: # %bb.1: # %for.body.preheader
-; CHECK32-NEXT: lwz r3, L..C0(r2) # %const.0
-; CHECK32-NEXT: xxlxor f1, f1, f1
-; CHECK32-NEXT: xxlxor vs0, vs0, vs0
-; CHECK32-NEXT: xscvdpspn vs35, f1
-; CHECK32-NEXT: lxv vs34, 0(r3)
-; CHECK32-NEXT: .align 4
-; CHECK32-NEXT: L..BB1_2: # %for.body
-; CHECK32-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK32-NEXT: xscvdpspn vs36, f1
-; CHECK32-NEXT: xxspltiw vs1, 1170469888
-; CHECK32-NEXT: vperm v4, v4, v3, v2
-; CHECK32-NEXT: xvcmpgtsp vs2, vs0, vs36
-; CHECK32-NEXT: xvmaddasp vs1, vs36, vs0
-; CHECK32-NEXT: xxland vs1, vs2, vs1
-; CHECK32-NEXT: xscvspdpn f1, vs1
-; CHECK32-NEXT: b L..BB1_2
diff --git a/llvm/test/CodeGen/X86/break-false-dep.ll b/llvm/test/CodeGen/X86/break-false-dep.ll
index 6943622fac7f2..a6ad3018e052c 100644
--- a/llvm/test/CodeGen/X86/break-false-dep.ll
+++ b/llvm/test/CodeGen/X86/break-false-dep.ll
@@ -472,17 +472,17 @@ define dso_local void @loopdep3() {
; SSE-WIN-NEXT: movaps %xmm6, (%rsp) # 16-byte Spill
; SSE-WIN-NEXT: .seh_savexmm %xmm6, 0
; SSE-WIN-NEXT: .seh_endprologue
-; SSE-WIN-NEXT: xorl %eax, %eax
-; SSE-WIN-NEXT: leaq v(%rip), %rcx
-; SSE-WIN-NEXT: leaq x(%rip), %rdx
-; SSE-WIN-NEXT: leaq y(%rip), %r8
-; SSE-WIN-NEXT: leaq z(%rip), %r9
-; SSE-WIN-NEXT: leaq w(%rip), %r10
+; SSE-WIN-NEXT: leaq v(%rip), %rax
+; SSE-WIN-NEXT: leaq x(%rip), %rcx
+; SSE-WIN-NEXT: leaq y(%rip), %rdx
+; SSE-WIN-NEXT: leaq z(%rip), %r8
+; SSE-WIN-NEXT: leaq w(%rip), %r9
+; SSE-WIN-NEXT: xorl %r10d, %r10d
; SSE-WIN-NEXT: .p2align 4
; SSE-WIN-NEXT: .LBB8_1: # %for.cond1.preheader
; SSE-WIN-NEXT: # =>This Loop Header: Depth=1
; SSE-WIN-NEXT: # Child Loop BB8_2 Depth 2
-; SSE-WIN-NEXT: movq %rcx, %r11
+; SSE-WIN-NEXT: movq %rax, %r11
; SSE-WIN-NEXT: xorl %esi, %esi
; SSE-WIN-NEXT: .p2align 4
; SSE-WIN-NEXT: .LBB8_2: # %for.body3
@@ -490,10 +490,10 @@ define dso_local void @loopdep3() {
; SSE-WIN-NEXT: # => This Inner Loop Header: Depth=2
; SSE-WIN-NEXT: xorps %xmm0, %xmm0
; SSE-WIN-NEXT: cvtsi2sdl (%r11), %xmm0
+; SSE-WIN-NEXT: mulsd (%rsi,%rcx), %xmm0
; SSE-WIN-NEXT: mulsd (%rsi,%rdx), %xmm0
; SSE-WIN-NEXT: mulsd (%rsi,%r8), %xmm0
-; SSE-WIN-NEXT: mulsd (%rsi,%r9), %xmm0
-; SSE-WIN-NEXT: movsd %xmm0, (%rsi,%r10)
+; SSE-WIN-NEXT: movsd %xmm0, (%rsi,%r9)
; SSE-WIN-NEXT: #APP
; SSE-WIN-NEXT: #NO_APP
; SSE-WIN-NEXT: addq $8, %rsi
@@ -502,8 +502,8 @@ define dso_local void @loopdep3() {
; SSE-WIN-NEXT: jne .LBB8_2
; SSE-WIN-NEXT: # %bb.3: # %for.inc14
; SSE-WIN-NEXT: # in Loop: Header=BB8_1 Depth=1
-; SSE-WIN-NEXT: incl %eax
-; SSE-WIN-NEXT: cmpl $100000, %eax # imm = 0x186A0
+; SSE-WIN-NEXT: incl %r10d
+; SSE-WIN-NEXT: cmpl $100000, %r10d # imm = 0x186A0
; SSE-WIN-NEXT: jne .LBB8_1
; SSE-WIN-NEXT: # %bb.4: # %for.end16
; SSE-WIN-NEXT: movaps (%rsp), %xmm6 # 16-byte Reload
@@ -550,17 +550,17 @@ define dso_local void @loopdep3() {
; AVX-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill
; AVX-NEXT: .seh_savexmm %xmm6, 0
; AVX-NEXT: .seh_endprologue
-; AVX-NEXT: xorl %eax, %eax
-; AVX-NEXT: leaq v(%rip), %rcx
-; AVX-NEXT: leaq x(%rip), %rdx
-; AVX-NEXT: leaq y(%rip), %r8
-; AVX-NEXT: leaq z(%rip), %r9
-; AVX-NEXT: leaq w(%rip), %r10
+; AVX-NEXT: leaq v(%rip), %rax
+; AVX-NEXT: leaq x(%rip), %rcx
+; AVX-NEXT: leaq y(%rip), %rdx
+; AVX-NEXT: leaq z(%rip), %r8
+; AVX-NEXT: leaq w(%rip), %r9
+; AVX-NEXT: xorl %r10d, %r10d
; AVX-NEXT: .p2align 4
; AVX-NEXT: .LBB8_1: # %for.cond1.preheader
; AVX-NEXT: # =>This Loop Header: Depth=1
; AVX-NEXT: # Child Loop BB8_2 Depth 2
-; AVX-NEXT: movq %rcx, %r11
+; AVX-NEXT: movq %rax, %r11
; AVX-NEXT: xorl %esi, %esi
; AVX-NEXT: .p2align 4
; AVX-NEXT: .LBB8_2: # %for.body3
@@ -568,10 +568,10 @@ define dso_local void @loopdep3() {
; AVX-NEXT: # => This Inner Loop Header: Depth=2
; AVX-NEXT: vxorps %xmm5, %xmm5, %xmm5
; AVX-NEXT: vcvtsi2sdl (%r11), %xmm5, %xmm0
+; AVX-NEXT: vmulsd (%rsi,%rcx), %xmm0, %xmm0
; AVX-NEXT: vmulsd (%rsi,%rdx), %xmm0, %xmm0
; AVX-NEXT: vmulsd (%rsi,%r8), %xmm0, %xmm0
-; AVX-NEXT: vmulsd (%rsi,%r9), %xmm0, %xmm0
-; AVX-NEXT: vmovsd %xmm0, (%rsi,%r10)
+; AVX-NEXT: vmovsd %xmm0, (%rsi,%r9)
; AVX-NEXT: #APP
; AVX-NEXT: #NO_APP
; AVX-NEXT: addq $8, %rsi
@@ -580,8 +580,8 @@ define dso_local void @loopdep3() {
; AVX-NEXT: jne .LBB8_2
; AVX-NEXT: # %bb.3: # %for.inc14
; AVX-NEXT: # in Loop: Header=BB8_1 Depth=1
-; AVX-NEXT: incl %eax
-; AVX-NEXT: cmpl $100000, %eax # imm = 0x186A0
+; AVX-NEXT: incl %r10d
+; AVX-NEXT: cmpl $100000, %r10d # imm = 0x186A0
; AVX-NEXT: jne .LBB8_1
; AVX-NEXT: ...
[truncated]
@llvm/pr-subscribers-backend-powerpc
LGTM.
When checking whether it is profitable to hoist an instruction, the pass may override a target's ruling because it assumes that all COPY instructions are cheap, and that may not be the case for all micro-architectures (especially when copying between different register classes).
On AArch64 there is a 0% difference in performance in LLVM's test-suite with this change. Additionally, very few tests were affected, which suggests the special case is not worth keeping.
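For illustration, here is a minimal sketch of the kind of target hook this change defers to: with MI.isCopyLike() no longer short-circuiting IsCheapInstruction(), a target's isAsCheapAsAMove() override is what decides whether a COPY counts as cheap when MachineLICM weighs hoisting. The MyTargetInstrInfo class (its declaration is assumed) and the same-register-class heuristic below are hypothetical, not taken from this patch or from any in-tree target.

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;

// Hypothetical override, not part of this patch: report only same-class
// virtual-register copies as cheap. Cross-register-class copies (e.g.
// GPR <-> vector) are assumed to be slow on this imaginary
// micro-architecture, so MachineLICM evaluates them like any other
// instruction instead of unconditionally treating them as free.
bool MyTargetInstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
  if (MI.isCopy()) {
    const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
    Register Dst = MI.getOperand(0).getReg();
    Register Src = MI.getOperand(1).getReg();
    return Dst.isVirtual() && Src.isVirtual() &&
           MRI.getRegClass(Dst) == MRI.getRegClass(Src);
  }
  // Defer to the generic implementation for everything else.
  return TargetInstrInfo::isAsCheapAsAMove(MI);
}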
x86 performance is slightly better (though that may just be noise) in an A/B comparison of five iterations of LLVM's test suite (Ryzen 5950X on Ubuntu):