[PPC]Optimize zeroing accumulator and spilling instructions into simple instructions #96094

diggerlin · 2024-06-19T17:25:19Z

in the patch , it will optimize the following instructions

        xxsetaccz 0
        xxmfacc 0
        stxv 0, 1792(1)
        stxv 1, 1776(1)
        stxv 2, 1760(1)
        stxv 3, 1744(1)

to

       xxlxor 0, 0, 0
      stxv 0, 1792(1)
      stxv 0, 1776(1)
      stxv 0, 1760(1)
      stxv 0, 1744(1)

llvmbot · 2024-06-19T17:25:47Z

@llvm/pr-subscribers-backend-powerpc

Author: zhijian lin (diggerlin)

Changes

in the patch , it will optimize the following instructions

        xxsetaccz 0
        xxmfacc 0
        stxv 0, 1792(1)
        stxv 1, 1776(1)
        stxv 2, 1760(1)
        stxv 3, 1744(1)

to

       xxlxor 0, 0, 0
      stxv 0, 1792(1)
      stxv 0, 1776(1)
      stxv 0, 1760(1)
      stxv 0, 1744(1)

Full diff: https://github.com/llvm/llvm-project/pull/96094.diff

2 Files Affected:

(modified) llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp (+88)
(modified) llvm/test/CodeGen/PowerPC/mma-intrinsics.ll (+16-20)

diff --git a/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp b/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
index d45edd74ab854..4dac9d1708d8a 100644
--- a/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
+++ b/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
@@ -109,6 +109,93 @@ static bool hasPCRelativeForm(MachineInstr &Use) {
           MachineFunctionProperties::Property::NoVRegs);
     }
 
+    // The funtion will simply the zeroing accumulator and spilling instrcutions
+    // into simple xxlxor and spilling instrcuctions.
+    // From:
+    // setaccz acci
+    // xxmfacc acci
+    // stxv vsr(i*4+0), D(1)
+    // stxv vsr(i*4+1), D-16(1)
+    // stxv vsr(i*4+2), D-32(1)
+    // stxv vsr(i*4+3), D-48(1)
+
+    // To:
+    // xxlxor vsr(i*4), 0, 0
+    // stxv vsr(i*4), D(1)
+    // stxv vsr(i*4), D-16(1)
+    // stxv vsr(i*4), D-32(1)
+    // stxv vsr(i*4), D-48(1)
+    bool
+    OptimizeZeroingAccumulatorSpilling(MachineBasicBlock &MBB,
+                                       const TargetRegisterInfo *TRI) const {
+      bool changed = false;
+      for (auto BBI = MBB.instr_begin(); BBI != MBB.instr_end(); ++BBI) {
+        if (BBI->getOpcode() != PPC::XXSETACCZ)
+          continue;
+
+        Register ACCZReg = BBI->getOperand(0).getReg();
+
+        DenseSet<MachineInstr *> InstrsToErase;
+        InstrsToErase.insert(&*BBI++);
+
+        if (BBI->getOpcode() != PPC::XXMFACC) {
+	  --BBI;
+          continue;
+	}
+
+        Register ACCWReg = BBI->getOperand(0).getReg();
+
+        if (ACCWReg != ACCZReg) 
+          continue;
+
+        auto XXMFACCInstr = BBI;
+        InstrsToErase.insert(&*BBI++);
+
+        Register VSLRegBase = (ACCWReg - PPC::ACC0) * 4 + PPC::VSL0;
+        bool isVSLRegBaseKilled = false;
+        for (unsigned InstrCount = 0; InstrCount < 4; ++InstrCount, ++BBI) {
+          if (BBI->getOpcode() == PPC::STXV) {
+            Register Reg0 = BBI->getOperand(0).getReg();
+            // If the VSLRegBase Register is killed, we put the kill in the
+            // last STXV instruction.
+            if (Reg0 == VSLRegBase && BBI->getOperand(0).isKill())
+              isVSLRegBaseKilled = true;
+            if (Reg0 < VSLRegBase || Reg0 > VSLRegBase + 3)
+              continue;
+          } else {
+	      --BBI;
+              continue;
+	  }
+          }
+
+          BBI = XXMFACCInstr;
+          BBI++;
+          for (unsigned InstrCount = 0; InstrCount < 4; ++InstrCount, ++BBI) {
+            Register VSLiReg = BBI->getOperand(0).getReg();
+            BBI->substituteRegister(VSLiReg, VSLRegBase, 0, *TRI);
+            BBI->getOperand(0).setIsKill(false);
+          }
+
+          if (isVSLRegBaseKilled)
+            (--BBI)->getOperand(0).setIsKill(true);
+
+          DebugLoc DL = XXMFACCInstr->getDebugLoc();
+          const PPCInstrInfo *TII = XXMFACCInstr->getMF()
+                                        ->getSubtarget<PPCSubtarget>()
+                                        .getInstrInfo();
+
+          BuildMI(MBB, &*XXMFACCInstr, DL, TII->get(PPC::XXLXOR), VSLRegBase)
+              .addReg(VSLRegBase,RegState::Undef)
+              .addReg(VSLRegBase,RegState::Undef);
+
+          for (MachineInstr *MI : InstrsToErase)
+            MI->eraseFromParent();
+
+          changed |= true;
+        }
+      return changed;
+    }
+
     // This function removes any redundant load immediates. It has two level
     // loops - The outer loop finds the load immediates BBI that could be used
     // to replace following redundancy. The inner loop scans instructions that
@@ -466,6 +553,7 @@ static bool hasPCRelativeForm(MachineInstr &Use) {
         Changed |= removeRedundantLIs(MBB, TRI);
         Changed |= addLinkerOpt(MBB, TRI);
         Changed |= removeAccPrimeUnprime(MBB);
+        Changed |= OptimizeZeroingAccumulatorSpilling(MBB, TRI);
         for (MachineInstr &MI : MBB) {
           unsigned Opc = MI.getOpcode();
           if (Opc == PPC::UNENCODED_NOP) {
diff --git a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll
index 53b0a2737122e..17e24eefc2580 100644
--- a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll
+++ b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll
@@ -115,22 +115,20 @@ declare <512 x i1> @llvm.ppc.mma.xxsetaccz()
 define void @int_xxsetaccz(ptr %ptr) {
 ; CHECK-LABEL: int_xxsetaccz:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xxsetaccz acc0
-; CHECK-NEXT:    xxmfacc acc0
+; CHECK-NEXT:    xxlxor vs0, vs0, vs0
 ; CHECK-NEXT:    stxv vs0, 48(r3)
-; CHECK-NEXT:    stxv vs1, 32(r3)
-; CHECK-NEXT:    stxv vs2, 16(r3)
-; CHECK-NEXT:    stxv vs3, 0(r3)
+; CHECK-NEXT:    stxv vs0, 32(r3)
+; CHECK-NEXT:    stxv vs0, 16(r3)
+; CHECK-NEXT:    stxv vs0, 0(r3)
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-BE-LABEL: int_xxsetaccz:
 ; CHECK-BE:       # %bb.0: # %entry
-; CHECK-BE-NEXT:    xxsetaccz acc0
-; CHECK-BE-NEXT:    xxmfacc acc0
-; CHECK-BE-NEXT:    stxv vs1, 16(r3)
+; CHECK-BE-NEXT:    xxlxor vs0, vs0, vs0
+; CHECK-BE-NEXT:    stxv vs0, 16(r3)
 ; CHECK-BE-NEXT:    stxv vs0, 0(r3)
-; CHECK-BE-NEXT:    stxv vs3, 48(r3)
-; CHECK-BE-NEXT:    stxv vs2, 32(r3)
+; CHECK-BE-NEXT:    stxv vs0, 48(r3)
+; CHECK-BE-NEXT:    stxv vs0, 32(r3)
 ; CHECK-BE-NEXT:    blr
 entry:
   %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
@@ -143,22 +141,20 @@ declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble
 define void @disass_acc(ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr %ptr4) {
 ; CHECK-LABEL: disass_acc:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xxsetaccz acc0
-; CHECK-NEXT:    xxmfacc acc0
-; CHECK-NEXT:    stxv vs3, 0(r3)
-; CHECK-NEXT:    stxv vs2, 0(r4)
-; CHECK-NEXT:    stxv vs1, 0(r5)
+; CHECK-NEXT:    xxlxor vs0, vs0, vs0 
+; CHECK-NEXT:    stxv vs0, 0(r3)
+; CHECK-NEXT:    stxv vs0, 0(r4)
+; CHECK-NEXT:    stxv vs0, 0(r5)
 ; CHECK-NEXT:    stxv vs0, 0(r6)
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-BE-LABEL: disass_acc:
 ; CHECK-BE:       # %bb.0: # %entry
-; CHECK-BE-NEXT:    xxsetaccz acc0
-; CHECK-BE-NEXT:    xxmfacc acc0
+; CHECK-BE-NEXT:    xxlxor vs0, vs0, vs0
 ; CHECK-BE-NEXT:    stxv vs0, 0(r3)
-; CHECK-BE-NEXT:    stxv vs1, 0(r4)
-; CHECK-BE-NEXT:    stxv vs2, 0(r5)
-; CHECK-BE-NEXT:    stxv vs3, 0(r6)
+; CHECK-BE-NEXT:    stxv vs0, 0(r4)
+; CHECK-BE-NEXT:    stxv vs0, 0(r5)
+; CHECK-BE-NEXT:    stxv vs0, 0(r6)
 ; CHECK-BE-NEXT:    blr
 entry:
   %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()

chenzheng1030 · 2024-06-25T02:00:11Z

llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp

+
+        DenseSet<MachineInstr *> InstrsToErase;
+        InstrsToErase.insert(&*BBI++);
+


This does not look good. We need a more generic handling for all cases. For now, this is targeted for a specific case in the comment. Compiler scheduling may change the order of these instructions.

it is not necessary there must be 4 stores...

limiting the case that XXSETACCZ and XXMFACC must be adjacent is not a good idea...

limitting the case that the four stores must be followed XXMFACC is also not good.

The loop structure also seems not able to handle case

// setaccz acci // xxmfacc acci // setaccz acci2 // xxmfacc acci2 // stxv vsr(i*4+0), D(1) // stxv vsr(i*4+1), D-16(1) // stxv vsr(i*4+2), D-32(1) // stxv vsr(i*4+3), D-48(1) // stxv vsr(i2*4+0), D(2) // stxv vsr(i2*4+1), D-16(2) // stxv vsr(i2*4+2), D-32(2) // stxv vsr(i2*4+3), D-48(2)

I agree.
We want this to be a general solution. I'm not sure that the approach of putting this so late in the pre-emit peephole is a good idea. When are these stores generated? Is this a spill? Is it just a store of a zeroed ACC register?

It may be easier to fix this the first time when the stores are generated because then the instruction scheduling won't be a problem and we just generate the instructions we want.

it is a accumulator register spill, there is another solution to check whether there is XXSETACCZ instruction in the function PPCRegisterInfo::lowerACCSpilling()

if there is the XXSETACCZ instruction , we do not do
https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp#L1336
BuildMI(MBB, II, DL, TII.get(PPC::XXMFACC), SrcReg).addReg(SrcReg);

we change the code

BuildMI(MBB, II, DL, TII.get(PPC::XXMFACC), SrcReg).addReg(SrcReg)

to something like

BuildMI(MBB, II, DL, TII->get(PPC::XXLXOR), Reg)

And in the function static void spillRegPairs change

addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXV)) .addReg(Reg + i, getKillRegState(IsKilled)), FrameIndex, Offset);

to

addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXV)) .addReg(Reg , getKillRegState(IsKilled)), FrameIndex, Offset);

but I do not think it is good place to put optimize in the function PPCRegisterInfo::lowerACCSpilling (the function is for spill functionality )

it is not necessary there must be 4 stores...

the patch is accumulator spill, it must has 4 stores.

limiting the case that XXSETACCZ and XXMFACC must be adjacent is not a good idea...

OK, I will deal the XXSETACCZ and XXMFACC are not adjacent.

limitting the case that the four stores must be followed XXMFACC is also not good.

the patch is accumulator spill, in the function PPCRegisterInfo::lowerACCSpilling. the XXMFACC is followed by
4 store, but I can deal with XXMFACC is not followed by 4 store any ways.

The loop structure also seems not able to handle case

I will deal with these case.

chenzheng1030 · 2024-06-25T03:01:59Z

llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp

+          if (BBI->getOpcode() == PPC::STXV) {
+            Register Reg0 = BBI->getOperand(0).getReg();
+            // If the VSLRegBase Register is killed, we put the kill in the
+            // last STXV instruction.


Add a FIXME here that we may need to update killed flag for other vsr as well.

chenzheng1030 · 2024-06-25T03:07:21Z

llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp

+              .addReg(VSLRegBase,RegState::Undef);
+
+          for (MachineInstr *MI : InstrsToErase)
+            MI->eraseFromParent();


Maybe we can put this function outside of the main loop? i.e., we collect all the unneeded instructions and remove them all just before this function returns.

chenzheng1030 · 2024-06-25T03:16:43Z

llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp

+    // From:
+    // setaccz acci
+    // xxmfacc acci
+    // stxv vsr(i*4+0), D(1)


What does D(1) mean? 1 is for R1? could we use another name like Base? And we don't check the offsets in the implementation, so maybe the D/D-16/D-32/D-48 is also not necessary?

chenzheng1030 · 2024-06-25T03:17:29Z

llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp

+    bool
+    OptimizeZeroingAccumulatorSpilling(MachineBasicBlock &MBB,
+                                       const TargetRegisterInfo *TRI) const {
+      bool changed = false;


nit: for variables, first letter is capitalized.

chenzheng1030 · 2024-06-25T03:26:43Z

llvm/test/CodeGen/PowerPC/mma-intrinsics.ll

-; CHECK-NEXT:    stxv vs3, 0(r3)
+; CHECK-NEXT:    stxv vs0, 32(r3)
+; CHECK-NEXT:    stxv vs0, 16(r3)
+; CHECK-NEXT:    stxv vs0, 0(r3)


Not related to this patch, the case at line 50 seems like an existing bug...

sorry that I can not got the comment, can you explain more detail ?

diggerlin · 2025-10-16T17:40:58Z

I close the PR.

first commit of replace XXSETACCZ

26beee4

llvmbot added the backend:PowerPC label Jun 19, 2024

diggerlin requested review from amy-kwan, chenzheng1030, mandlebug and stefanp-synopsys June 19, 2024 17:26

chenzheng1030 reviewed Jun 25, 2024

View reviewed changes

diggerlin added 2 commits August 9, 2024 15:38

deal with not adjacent instruction

2f8614a

Merge branch 'llvm:main' into digger/XXSETACCZ

1bca20b

lei137 requested a review from chenzheng1030 November 28, 2024 22:25

diggerlin marked this pull request as draft November 29, 2024 21:35

diggerlin closed this Oct 16, 2025


		DenseSet<MachineInstr *> InstrsToErase;
		InstrsToErase.insert(&*BBI++);

[PPC]Optimize zeroing accumulator and spilling instructions into simple instructions #96094

[PPC]Optimize zeroing accumulator and spilling instructions into simple instructions #96094

Uh oh!

Conversation

diggerlin commented Jun 19, 2024

Uh oh!

llvmbot commented Jun 19, 2024

Uh oh!

chenzheng1030 Jun 25, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

stefanp-synopsys Jul 17, 2024

Choose a reason for hiding this comment

Uh oh!

diggerlin Aug 8, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

diggerlin Aug 8, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

chenzheng1030 Jun 25, 2024

Choose a reason for hiding this comment

Uh oh!

chenzheng1030 Jun 25, 2024

Choose a reason for hiding this comment

Uh oh!

chenzheng1030 Jun 25, 2024

Choose a reason for hiding this comment

Uh oh!

chenzheng1030 Jun 25, 2024

Choose a reason for hiding this comment

Uh oh!

chenzheng1030 Jun 25, 2024

Choose a reason for hiding this comment

Uh oh!

diggerlin Aug 9, 2024

Choose a reason for hiding this comment

Uh oh!

diggerlin commented Oct 16, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

4 participants

chenzheng1030 Jun 25, 2024 •

edited

Loading

diggerlin Aug 8, 2024 •

edited

Loading

diggerlin Aug 8, 2024 •

edited

Loading

diggerlin commented Oct 16, 2025 •

edited

Loading