-
Notifications
You must be signed in to change notification settings - Fork 15.4k
[PowerPC] hoist xxspltib out of loop body #127121
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-backend-powerpc Author: zhijian lin (diggerlin) Changes: The patch fixes issue #127119. Full diff: https://github.com/llvm/llvm-project/pull/127121.diff 3 Files Affected:
diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
index 8e400bc63b785..d9e88b283a749 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -1659,8 +1659,6 @@ let Predicates = [HasVSX, HasP9Vector] in {
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">;
// Vector Splat Immediate Byte
- // FIXME: Setting the hasSideEffects flag here to match current behaviour.
- let hasSideEffects = 1 in
def XXSPLTIB : X_RD6_IMM8<60, 360, (outs vsrc:$XT), (ins u8imm:$IMM8),
"xxspltib $XT, $IMM8", IIC_VecPerm, []>;
diff --git a/llvm/test/CodeGen/PowerPC/memset-tail.ll b/llvm/test/CodeGen/PowerPC/memset-tail.ll
index 31c136d009ba5..ec9929e0d90dd 100644
--- a/llvm/test/CodeGen/PowerPC/memset-tail.ll
+++ b/llvm/test/CodeGen/PowerPC/memset-tail.ll
@@ -380,9 +380,9 @@ define dso_local void @memsetTailV1B2(ptr nocapture noundef writeonly %p) local_
;
; P9-BE-LABEL: memsetTailV1B2:
; P9-BE: # %bb.0: # %entry
+; P9-BE-NEXT: xxspltib 0, 15
; P9-BE-NEXT: li 4, 3855
; P9-BE-NEXT: sth 4, 16(3)
-; P9-BE-NEXT: xxspltib 0, 15
; P9-BE-NEXT: stxv 0, 0(3)
; P9-BE-NEXT: blr
;
@@ -433,9 +433,9 @@ define dso_local void @memsetTailV1B1(ptr nocapture noundef writeonly %p) local_
;
; P9-BE-LABEL: memsetTailV1B1:
; P9-BE: # %bb.0: # %entry
+; P9-BE-NEXT: xxspltib 0, 15
; P9-BE-NEXT: li 4, 15
; P9-BE-NEXT: stb 4, 16(3)
-; P9-BE-NEXT: xxspltib 0, 15
; P9-BE-NEXT: stxv 0, 0(3)
; P9-BE-NEXT: blr
;
@@ -861,9 +861,9 @@ define dso_local void @memset2TailV1B2(ptr nocapture noundef writeonly %p) local
;
; P9-BE-LABEL: memset2TailV1B2:
; P9-BE: # %bb.0: # %entry
+; P9-BE-NEXT: xxspltib 0, 165
; P9-BE-NEXT: li 4, -23131
; P9-BE-NEXT: sth 4, 16(3)
-; P9-BE-NEXT: xxspltib 0, 165
; P9-BE-NEXT: stxv 0, 0(3)
; P9-BE-NEXT: blr
;
@@ -917,9 +917,9 @@ define dso_local void @memset2TailV1B1(ptr nocapture noundef writeonly %p) local
;
; P9-BE-LABEL: memset2TailV1B1:
; P9-BE: # %bb.0: # %entry
+; P9-BE-NEXT: xxspltib 0, 165
; P9-BE-NEXT: li 4, -91
; P9-BE-NEXT: stb 4, 16(3)
-; P9-BE-NEXT: xxspltib 0, 165
; P9-BE-NEXT: stxv 0, 0(3)
; P9-BE-NEXT: blr
;
diff --git a/llvm/test/CodeGen/PowerPC/pr127119.ll b/llvm/test/CodeGen/PowerPC/pr127119.ll
new file mode 100644
index 0000000000000..2856433777712
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/pr127119.ll
@@ -0,0 +1,95 @@
+;; Test `xxspltib` hoist out loop.
+
+; RUN: llc -verify-machineinstrs -mtriple powerpc64-ibm-aix-xcoff \
+; RUN: %s -o - 2>&1 | FileCheck %s
+
+define void @_Z3fooPfS_Pi(ptr noalias nocapture noundef writeonly %_a, ptr noalias nocapture noundef readonly %In_a, ptr noalias nocapture noundef readonly %n) local_unnamed_addr #0 {
+entry:
+ %0 = load i32, ptr %n, align 4
+ %cmp9 = icmp sgt i32 %0, 0
+ br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %wide.trip.count = zext nneg i32 %0 to i64
+ %xtraiter = and i64 %wide.trip.count, 1
+ %1 = icmp eq i32 %0, 1
+ br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
+
+for.body.preheader.new:
+ %unroll_iter = and i64 %wide.trip.count, 2147483646
+ br label %for.body
+
+for.cond.cleanup.loopexit.unr-lcssa:
+ %indvars.iv.unr = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next.1, %for.body ]
+ %lcmp.mod.not = icmp eq i64 %xtraiter, 0
+ br i1 %lcmp.mod.not, label %for.cond.cleanup, label %for.body.epil
+
+for.body.epil:
+ %arrayidx.epil = getelementptr inbounds nuw float, ptr %In_a, i64 %indvars.iv.unr
+ %2 = load float, ptr %arrayidx.epil, align 4
+ %vecins.i.epil = insertelement <4 x float> poison, float %2, i64 0
+ %3 = bitcast <4 x float> %vecins.i.epil to <16 x i8>
+ %and1.i.epil = and <16 x i8> %3, <i8 6, i8 6, i8 6, i8 6, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>
+ %4 = bitcast <16 x i8> %and1.i.epil to <4 x float>
+ %vecext.i.epil = extractelement <4 x float> %4, i64 0
+ %arrayidx5.epil = getelementptr inbounds nuw float, ptr %_a, i64 %indvars.iv.unr
+ store float %vecext.i.epil, ptr %arrayidx5.epil, align 4
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %for.body.preheader.new ], [ %indvars.iv.next.1, %for.body ]
+ %niter = phi i64 [ 0, %for.body.preheader.new ], [ %niter.next.1, %for.body ]
+ %arrayidx = getelementptr inbounds nuw float, ptr %In_a, i64 %indvars.iv
+ %5 = load float, ptr %arrayidx, align 4
+ %vecins.i = insertelement <4 x float> poison, float %5, i64 0
+ %6 = bitcast <4 x float> %vecins.i to <16 x i8>
+ %and1.i = and <16 x i8> %6, <i8 6, i8 6, i8 6, i8 6, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>
+ %7 = bitcast <16 x i8> %and1.i to <4 x float>
+ %vecext.i = extractelement <4 x float> %7, i64 0
+ %arrayidx5 = getelementptr inbounds nuw float, ptr %_a, i64 %indvars.iv
+ store float %vecext.i, ptr %arrayidx5, align 4
+ %indvars.iv.next = or disjoint i64 %indvars.iv, 1
+ %arrayidx.1 = getelementptr inbounds nuw float, ptr %In_a, i64 %indvars.iv.next
+ %8 = load float, ptr %arrayidx.1, align 4
+ %vecins.i.1 = insertelement <4 x float> poison, float %8, i64 0
+ %9 = bitcast <4 x float> %vecins.i.1 to <16 x i8>
+ %and1.i.1 = and <16 x i8> %9, <i8 6, i8 6, i8 6, i8 6, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>
+ %10 = bitcast <16 x i8> %and1.i.1 to <4 x float>
+ %vecext.i.1 = extractelement <4 x float> %10, i64 0
+ %arrayidx5.1 = getelementptr inbounds nuw float, ptr %_a, i64 %indvars.iv.next
+ store float %vecext.i.1, ptr %arrayidx5.1, align 4
+ %indvars.iv.next.1 = add nuw nsw i64 %indvars.iv, 2
+ %niter.next.1 = add i64 %niter, 2
+ %niter.ncmp.1 = icmp eq i64 %niter.next.1, %unroll_iter
+ br i1 %niter.ncmp.1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
+}
+
+attributes #0 = { mustprogress nofree norecurse nosync nounwind memory(argmem: readwrite) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="pwr10" "target-features"="+altivec,+bpermd,+crbits,+crypto,+direct-move,+extdiv,+isa-v206-instructions,+isa-v207-instructions,+isa-v30-instructions,+isa-v31-instructions,+mma,+paired-vector-memops,+pcrelative-memops,+power10-vector,+power8-vector,+power9-vector,+prefix-instrs,+quadword-atomics,+vsx,-aix-shared-lib-tls-model-opt,-aix-small-local-dynamic-tls,-aix-small-local-exec-tls,-htm,-privileged,-rop-protect,-spe" }
+
+; CHECK: ._Z3fooPfS_Pi:
+; CHECK-NEXT: # %bb.0: # %entry
+; CHECK-NEXT: lwz 5, 0(5)
+; CHECK-NEXT: cmpwi 5, 1
+; CHECK-NEXT: bltlr 0
+; CHECK-NEXT: # %bb.1: # %for.body.preheader
+; CHECK-NEXT: li 6, 0
+; CHECK-NEXT: cmplwi 5, 1
+; CHECK-NEXT: beq 0, L..BB0_4
+; CHECK-NEXT: # %bb.2: # %for.body.preheader.new
+; CHECK-NEXT: rlwinm 6, 5, 0, 1, 30
+; CHECK-NEXT: xxspltib 0, 6
+; CHECK-NEXT: addi 9, 4, -8
+; CHECK-NEXT: addi 7, 3, -8
+; CHECK-NEXT: li 8, 8
+; CHECK-NEXT: li 10, 12
+; CHECK-NEXT: li 11, 4
+; CHECK-NEXT: addi 6, 6, -2
+; CHECK-NEXT: rldicl 6, 6, 63, 1
+; CHECK-NEXT: addi 6, 6, 1
+; CHECK-NEXT: mtctr 6
+; CHECK-NEXT: li 6, 0
+; CHECK-NEXT: .align 4
+; CHECK-NEXT: L..BB0_3:
|
lei137
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In general this looks good to me. However, it would be good if you could do an NFC pre-commit of the newly added test, so that this patch can show the changes.
I created a separate NFC patch for the newly added test case: #127701 |
…ists xxsplitib out of loop (#127701) This is a pre-commit test case for patch llvm/llvm-project#127121 that hoists xxsplitib out of loop
ad69826 to
aedecc5
Compare
|
rebased the code based on the pre-commit 481e1eb |
lei137
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
I made a small update to your subject title and description body. Hope that is okay.
Fixes #127119
Remove `hasSideEffects` from `xxspltib`, since there is no special restriction specified in the ISA that prevents it from being reordered, moved, CSE'd, or hoisted by LICM. Removing this restriction will allow `xxspltib` to be hoisted out of loop bodies.