Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 144 additions & 0 deletions llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,149 @@ static bool hasPCRelativeForm(MachineInstr &Use) {
MachineFunctionProperties::Property::NoVRegs);
}

// This function simplifies the zeroing-accumulator and spilling instructions
// into a simple xxlxor and spilling instructions.
// From:
// setaccz acci
// xxmfacc acci
// stxv vsr(i*4+0), D(Base)
// stxv vsr(i*4+1), D-16(Base)
// stxv vsr(i*4+2), D-32(Base)
// stxv vsr(i*4+3), D-48(Base)

// To:
// xxlxor vsr(i*4), 0, 0
// stxv vsr(i*4), D(Base)
// stxv vsr(i*4), D-16(Base)
// stxv vsr(i*4), D-32(Base)
// stxv vsr(i*4), D-48(Base)
//
// The rewrite is applied only when:
//   * the accumulator is not redefined between XXSETACCZ and XXMFACC, and
//   * XXMFACC is immediately followed by four STXV instructions, each storing
//     one of the four VSRs that overlap that accumulator.
// Returns true if any instruction in MBB was changed.
// NOTE(review): nothing here proves that the accumulator and the other three
// VSRs are dead after the four stores - confirm against liveness information.
bool
OptimizeZeroingAccumulatorSpilling(MachineBasicBlock &MBB,
                                   const TargetRegisterInfo *TRI) const {
  bool Changed = false;
  // Erasure is deferred until the scan is finished so that the iterators used
  // while walking the block are never invalidated.
  DenseSet<MachineInstr *> InstrsToErase;
  const auto BlockEnd = MBB.instr_end();

  for (auto BBI = MBB.instr_begin(); BBI != BlockEnd; ++BBI) {
    if (BBI->getOpcode() != PPC::XXSETACCZ)
      continue;

    MachineInstr *XXSETACCZInstr = &*BBI;
    const Register ACCZReg = BBI->getOperand(0).getReg();
    // First of the four VSRs that overlap the accumulator.
    const Register VSLRegBase = (ACCZReg - PPC::ACC0) * 4 + PPC::VSL0;

    // Step 1: find the XXMFACC of the same accumulator. The scan uses its own
    // iterator so the outer loop still visits every instruction (the one right
    // after XXSETACCZ may itself be an XXSETACCZ).
    MachineInstr *XXMFACCInstr = nullptr;
    auto ScanIt = BBI;
    for (++ScanIt; ScanIt != BlockEnd; ++ScanIt) {
      if (ScanIt->getOpcode() == PPC::XXMFACC) {
        // An XXMFACC of a different accumulator is irrelevant; keep looking.
        if (ScanIt->getOperand(0).getReg() != ACCZReg)
          continue;
        XXMFACCInstr = &*ScanIt;
        break;
      }

      // If the accumulator is redefined before XXMFACC, it no longer holds the
      // zero written by XXSETACCZ, so give up on this XXSETACCZ.
      bool IsACCZRegRedefined = false;
      for (const MachineOperand &Operand : ScanIt->operands()) {
        if (Operand.isReg() && Operand.isDef() &&
            Operand.getReg() == ACCZReg) {
          IsACCZRegRedefined = true;
          break;
        }
      }
      if (IsACCZRegRedefined)
        break;
    }
    if (!XXMFACCInstr)
      continue;

    // Step 2: the four instructions directly after XXMFACC must all be STXVs
    // of VSRs that belong to the accumulator. Every candidate is validated
    // (including against the end of the block) before anything is modified.
    auto FirstSTXV = ScanIt;
    ++FirstSTXV;
    bool IsVSLRegBaseKilled = false;
    bool FoundFourSpills = true;
    auto StoreIt = FirstSTXV;
    for (unsigned Count = 0; Count < 4; ++Count, ++StoreIt) {
      if (StoreIt == BlockEnd || StoreIt->getOpcode() != PPC::STXV) {
        FoundFourSpills = false;
        break;
      }
      const MachineOperand &Src = StoreIt->getOperand(0);
      const Register Reg0 = Src.getReg();
      // The stored register is not one of the VSRs mapped to the accumulator.
      if (!(Reg0 >= VSLRegBase && Reg0 <= VSLRegBase + 3)) {
        FoundFourSpills = false;
        break;
      }
      // Remember a kill of the base VSR; it is re-attached to the last store
      // once all four stores read the base VSR.
      // FIXME: We may need to update killed flag for other vsr as well.
      if (Reg0 == VSLRegBase && Src.isKill())
        IsVSLRegBaseKilled = true;
    }
    if (!FoundFourSpills)
      continue;

    // Step 3: make all four stores read the base VSR.
    MachineInstr *LastSTXV = nullptr;
    StoreIt = FirstSTXV;
    for (unsigned Count = 0; Count < 4; ++Count, ++StoreIt) {
      Register VSLReg = StoreIt->getOperand(0).getReg();
      StoreIt->substituteRegister(VSLReg, VSLRegBase, 0, *TRI);
      // The base VSR is still read by the following stores.
      StoreIt->getOperand(0).setIsKill(false);
      LastSTXV = &*StoreIt;
    }
    if (IsVSLRegBaseKilled)
      LastSTXV->getOperand(0).setIsKill(true);

    // Step 4: zero the base VSR at the position of XXMFACC, then schedule the
    // XXSETACCZ / XXMFACC pair for removal.
    DebugLoc DL = XXMFACCInstr->getDebugLoc();
    const PPCInstrInfo *TII = XXMFACCInstr->getMF()
                                  ->getSubtarget<PPCSubtarget>()
                                  .getInstrInfo();

    BuildMI(MBB, XXMFACCInstr, DL, TII->get(PPC::XXLXOR), VSLRegBase)
        .addReg(VSLRegBase, RegState::Undef)
        .addReg(VSLRegBase, RegState::Undef);

    InstrsToErase.insert(XXSETACCZInstr);
    InstrsToErase.insert(XXMFACCInstr);
    Changed = true;
  }

  for (MachineInstr *MI : InstrsToErase)
    MI->eraseFromParent();
  return Changed;
}

// This function removes any redundant load immediates. It has two level
// loops - The outer loop finds the load immediates BBI that could be used
// to replace following redundancy. The inner loop scans instructions that
Expand Down Expand Up @@ -466,6 +609,7 @@ static bool hasPCRelativeForm(MachineInstr &Use) {
Changed |= removeRedundantLIs(MBB, TRI);
Changed |= addLinkerOpt(MBB, TRI);
Changed |= removeAccPrimeUnprime(MBB);
Changed |= OptimizeZeroingAccumulatorSpilling(MBB, TRI);
for (MachineInstr &MI : MBB) {
unsigned Opc = MI.getOpcode();
if (Opc == PPC::UNENCODED_NOP) {
Expand Down
54 changes: 24 additions & 30 deletions llvm/test/CodeGen/PowerPC/mma-intrinsics.ll
Original file line number Diff line number Diff line change
Expand Up @@ -115,22 +115,20 @@ declare <512 x i1> @llvm.ppc.mma.xxsetaccz()
define void @int_xxsetaccz(ptr %ptr) {
; CHECK-LABEL: int_xxsetaccz:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxsetaccz acc0
; CHECK-NEXT: xxmfacc acc0
; CHECK-NEXT: xxlxor vs0, vs0, vs0
; CHECK-NEXT: stxv vs0, 48(r3)
; CHECK-NEXT: stxv vs1, 32(r3)
; CHECK-NEXT: stxv vs2, 16(r3)
; CHECK-NEXT: stxv vs3, 0(r3)
; CHECK-NEXT: stxv vs0, 32(r3)
; CHECK-NEXT: stxv vs0, 16(r3)
; CHECK-NEXT: stxv vs0, 0(r3)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not related to this patch, the case at line 50 seems like an existing bug...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I did not understand the comment; could you explain it in more detail?

; CHECK-NEXT: blr
;
; CHECK-BE-LABEL: int_xxsetaccz:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: xxsetaccz acc0
; CHECK-BE-NEXT: xxmfacc acc0
; CHECK-BE-NEXT: stxv vs1, 16(r3)
; CHECK-BE-NEXT: xxlxor vs0, vs0, vs0
; CHECK-BE-NEXT: stxv vs0, 16(r3)
; CHECK-BE-NEXT: stxv vs0, 0(r3)
; CHECK-BE-NEXT: stxv vs3, 48(r3)
; CHECK-BE-NEXT: stxv vs2, 32(r3)
; CHECK-BE-NEXT: stxv vs0, 48(r3)
; CHECK-BE-NEXT: stxv vs0, 32(r3)
; CHECK-BE-NEXT: blr
entry:
%0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
Expand All @@ -143,22 +141,20 @@ declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble
define void @disass_acc(ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr %ptr4) {
; CHECK-LABEL: disass_acc:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxsetaccz acc0
; CHECK-NEXT: xxmfacc acc0
; CHECK-NEXT: stxv vs3, 0(r3)
; CHECK-NEXT: stxv vs2, 0(r4)
; CHECK-NEXT: stxv vs1, 0(r5)
; CHECK-NEXT: xxlxor vs0, vs0, vs0
; CHECK-NEXT: stxv vs0, 0(r3)
; CHECK-NEXT: stxv vs0, 0(r4)
; CHECK-NEXT: stxv vs0, 0(r5)
; CHECK-NEXT: stxv vs0, 0(r6)
; CHECK-NEXT: blr
;
; CHECK-BE-LABEL: disass_acc:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: xxsetaccz acc0
; CHECK-BE-NEXT: xxmfacc acc0
; CHECK-BE-NEXT: xxlxor vs0, vs0, vs0
; CHECK-BE-NEXT: stxv vs0, 0(r3)
; CHECK-BE-NEXT: stxv vs1, 0(r4)
; CHECK-BE-NEXT: stxv vs2, 0(r5)
; CHECK-BE-NEXT: stxv vs3, 0(r6)
; CHECK-BE-NEXT: stxv vs0, 0(r4)
; CHECK-BE-NEXT: stxv vs0, 0(r5)
; CHECK-BE-NEXT: stxv vs0, 0(r6)
; CHECK-BE-NEXT: blr
entry:
%0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
Expand Down Expand Up @@ -540,14 +536,13 @@ define void @testRedundantPrimeUnprime(ptr %dst, <16 x i8> %vc) nounwind {
; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r0, 16(r1)
; CHECK-NEXT: stdu r1, -112(r1)
; CHECK-NEXT: xxsetaccz acc0
; CHECK-NEXT: xxsetaccz acc1
; CHECK-NEXT: mr r30, r3
; CHECK-NEXT: xxmfacc acc0
; CHECK-NEXT: xxlxor vs0, vs0, vs0
; CHECK-NEXT: stxv vs0, 48(r3)
; CHECK-NEXT: stxv vs1, 32(r3)
; CHECK-NEXT: stxv vs2, 16(r3)
; CHECK-NEXT: stxv vs3, 0(r3)
; CHECK-NEXT: stxv vs0, 32(r3)
; CHECK-NEXT: stxv vs0, 16(r3)
; CHECK-NEXT: stxv vs0, 0(r3)
; CHECK-NEXT: xvf32gerpp acc1, v2, v2
; CHECK-NEXT: xxmfacc acc1
; CHECK-NEXT: stxv vs4, 80(r1)
Expand All @@ -572,15 +567,14 @@ define void @testRedundantPrimeUnprime(ptr %dst, <16 x i8> %vc) nounwind {
; CHECK-BE-NEXT: mflr r0
; CHECK-BE-NEXT: std r0, 16(r1)
; CHECK-BE-NEXT: stdu r1, -192(r1)
; CHECK-BE-NEXT: xxsetaccz acc0
; CHECK-BE-NEXT: xxsetaccz acc1
; CHECK-BE-NEXT: std r30, 176(r1) # 8-byte Folded Spill
; CHECK-BE-NEXT: mr r30, r3
; CHECK-BE-NEXT: xxmfacc acc0
; CHECK-BE-NEXT: stxv vs1, 16(r3)
; CHECK-BE-NEXT: xxlxor vs0, vs0, vs0
; CHECK-BE-NEXT: stxv vs0, 16(r3)
; CHECK-BE-NEXT: stxv vs0, 0(r3)
; CHECK-BE-NEXT: stxv vs3, 48(r3)
; CHECK-BE-NEXT: stxv vs2, 32(r3)
; CHECK-BE-NEXT: stxv vs0, 48(r3)
; CHECK-BE-NEXT: stxv vs0, 32(r3)
; CHECK-BE-NEXT: xvf32gerpp acc1, v2, v2
; CHECK-BE-NEXT: xxmfacc acc1
; CHECK-BE-NEXT: stxv vs4, 112(r1)
Expand Down