-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[PowerPC] hoist xxspltiw instruction out of the loop with FMA mutation pass. #111696
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-backend-powerpc Author: zhijian lin (diggerlin) ChangesFull diff: https://github.com/llvm/llvm-project/pull/111696.diff 2 Files Affected:
diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
index 7d0455942923dd..a9e8d038ffd8bd 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -572,7 +572,8 @@ void PPCPassConfig::addMachineSSAOptimization() {
void PPCPassConfig::addPreRegAlloc() {
if (getOptLevel() != CodeGenOptLevel::None) {
initializePPCVSXFMAMutatePass(*PassRegistry::getPassRegistry());
- insertPass(VSXFMAMutateEarly ? &RegisterCoalescerID : &MachineSchedulerID,
+ insertPass(VSXFMAMutateEarly ? &TwoAddressInstructionPassID
+ : &MachineSchedulerID,
&PPCVSXFMAMutateID);
}
diff --git a/llvm/test/CodeGen/PowerPC/vsx-fma-m-early.ll b/llvm/test/CodeGen/PowerPC/vsx-fma-m-early.ll
new file mode 100644
index 00000000000000..fa86fe7664e41c
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/vsx-fma-m-early.ll
@@ -0,0 +1,77 @@
+; RUN: llc -verify-machineinstrs -mcpu=pwr10 -disable-ppc-vsx-fma-mutation=false -ppc-asm-full-reg-names -schedule-ppc-vsx-fma-mutation-early < %s | \
+; RUN: FileCheck --check-prefix=CHECK-M %s
+
+; RUN: llc -verify-machineinstrs -mcpu=pwr10 -disable-ppc-vsx-fma-mutation=false -ppc-asm-full-reg-names < %s | \
+; RUN: FileCheck --check-prefix=CHECK-A %s
+
+target triple = "powerpc64-ibm-aix7.2.0.0"
+define void @vsexp(ptr noalias nocapture noundef writeonly %__output_a, ptr noalias nocapture noundef readonly %var1321In_a, ptr noalias nocapture noundef readonly %n) local_unnamed_addr #0 {
+entry:
+ %0 = load i32, ptr %n, align 4
+ %cmp11 = icmp sgt i32 %0, 0
+ br i1 %cmp11, label %for.body.preheader, label %for.end
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %0 to i64
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %1 = shl nsw i64 %indvars.iv, 2
+ %add.ptr = getelementptr inbounds float, ptr %var1321In_a, i64 %1
+ %add.ptr.val = load <4 x float>, ptr %add.ptr, align 1
+ %2 = tail call contract <4 x float> @llvm.fma.v4f32(<4 x float> %add.ptr.val, <4 x float> <float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000>, <4 x float> <float 6.270500e+03, float 6.270500e+03, float 6.270500e+03, float 6.270500e+03>)
+ %add.ptr6 = getelementptr inbounds float, ptr %__output_a, i64 %1
+ store <4 x float> %2, ptr %add.ptr6, align 1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
+
+; CHECK-M: .csect ..text..[PR],5{{[[:space:]].*}}.vsexp:
+; CHECK-M-NEXT: # %bb.0: # %entry
+; CHECK-M-NEXT: lwz r5, 0(r5)
+; CHECK-M-NEXT: cmpwi r5, 1
+; CHECK-M-NEXT: bltlr cr0
+; CHECK-M-NEXT: # %bb.1: # %for.body.preheader
+; CHECK-M-NEXT: xxspltiw vs0, 1069066811
+; CHECK-M-NEXT: xxspltiw vs1, 1170469888
+; CHECK-M-NEXT: mtctr r5
+; CHECK-M-NEXT: li r5, 0
+; CHECK-M-NEXT: .align 5
+; CHECK-M-NEXT: L..BB0_2: # %for.body
+; CHECK-M-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-M-NEXT: lxvx vs2, r4, r5
+; CHECK-M-NEXT: xvmaddmsp vs2, vs0, vs1
+; CHECK-M-NEXT: stxvx vs2, r3, r5
+; CHECK-M-NEXT: addi r5, r5, 16
+; CHECK-M-NEXT: bdnz L..BB0_2
+; CHECK-M-NEXT: # %bb.3: # %for.end
+; CHECK-M-NEXT: blr
+
+; CHECK-A: .csect ..text..[PR],5{{[[:space:]].*}}.vsexp:
+; CHECK-A-NEXT: # %bb.0: # %entry
+; CHECK-A-NEXT: lwz r5, 0(r5)
+; CHECK-A-NEXT: cmpwi r5, 1
+; CHECK-A-NEXT: bltlr cr0
+; CHECK-A-NEXT: # %bb.1: # %for.body.preheader
+; CHECK-A-NEXT: xxspltiw vs0, 1069066811
+; CHECK-A-NEXT: mtctr r5
+; CHECK-A-NEXT: li r5, 0
+; CHECK-A-NEXT: .align 5
+; CHECK-A-NEXT: L..BB0_2: # %for.body
+; CHECK-A-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-A-NEXT: lxvx vs1, r4, r5
+; CHECK-A-NEXT: xxspltiw vs2, 1170469888
+; CHECK-A-NEXT: xvmaddasp vs2, vs1, vs0
+; CHECK-A-NEXT: stxvx vs2, r3, r5
+; CHECK-A-NEXT: addi r5, r5, 16
+; CHECK-A-NEXT: bdnz L..BB0_2
+; CHECK-A-NEXT: # %bb.3: # %for.end
+; CHECK-A-NEXT: blr
|
PowerPC VSX FMA Mutation pass head of Register Coalescer passPowerPC VSX FMA Mutation pass head of Register Coalescer pass for option -schedule-ppc-vsx-fma-mutation-early
|
The description for this PR is way too long to be used as a commit message. I think it's better if you put the details in the issue this PR fixes (open one if there isn't one) and provide a more general commit message here. |
PowerPC VSX FMA Mutation pass head of Register Coalescer pass for option -schedule-ppc-vsx-fma-mutation-earlyPowerPC VSX FMA Mutation pass ahead of the Register Coalescer pass when the -schedule-ppc-vsx-fma-mutation-early option is enabled.
PowerPC VSX FMA Mutation pass ahead of the Register Coalescer pass when the -schedule-ppc-vsx-fma-mutation-early option is enabled.-schedule-ppc-vsx-fma-mutation-early
-schedule-ppc-vsx-fma-mutation-early-schedule-ppc-vsx-fma-mutation-early
amy-kwan
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree with Lei, the description is a bit too long and should concisely summarize the patch.
b3f6120 to
811c86c
Compare
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
811c86c to
c99bff3
Compare
-schedule-ppc-vsx-fma-mutation-earlyAdd a pre- commit test case for Patch #111696 Test ppc-vsx-fma-mutate pass work with -schedule-ppc-vsx-fma-mutation-early not hoist the instruction `xxspltiw vs2, 1170469888` out the loop. --------- Co-authored-by: Amy Kwan <[email protected]>
c99bff3 to
3e605b1
Compare
Add a pre- commit test case for Patch llvm/llvm-project#111696 Test ppc-vsx-fma-mutate pass work with -schedule-ppc-vsx-fma-mutation-early not hoist the instruction `xxspltiw vs2, 1170469888` out the loop. --------- Co-authored-by: Amy Kwan <[email protected]>
Add a pre- commit test case for Patch llvm#111696 Test ppc-vsx-fma-mutate pass work with -schedule-ppc-vsx-fma-mutation-early not hoist the instruction `xxspltiw vs2, 1170469888` out the loop. --------- Co-authored-by: Amy Kwan <[email protected]>
lei137
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
Thx!
Summary:
The patch fixes the issue [PowerPC] missing VSX FMA Mutation optimize in some case for option -schedule-ppc-vsx-fma-mutation-early #111906
In certain cases, the Register Coalescer pass—which eliminates COPY instructions—can interfere with the PowerPC VSX FMA Mutation pass. Specifically, it can prevent the mutation of a COPY adjacent to an XSMADDADP into a single XSMADDMDP instruction. As a result, the xxspltiw instruction is not hoisted out of the loop as expected, leading to missed optimization opportunities.
To address this, the patch ensures that the
VSX FMA Mutationpass runs before theRegister Coalescerpass when the -schedule-ppc-vsx-fma-mutation-early option is enabled.