diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 9d4d2d864fc32..57c86d9e5de64 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -1921,6 +1921,14 @@ unsigned PPCInstrInfo::getSpillIndex(const TargetRegisterClass *RC) const { OpcodeIndex = SOK_PairedVecSpill; } else if (PPC::G8pRCRegClass.hasSubClassEq(RC)) { OpcodeIndex = SOK_PairedG8Spill; + } else if (PPC::DMRROWRCRegClass.hasSubClassEq(RC)) { + llvm_unreachable("TODO: Implement spill DMRROW regclass!"); + } else if (PPC::DMRROWpRCRegClass.hasSubClassEq(RC)) { + llvm_unreachable("TODO: Implement spill DMRROWp regclass!"); + } else if (PPC::DMRpRCRegClass.hasSubClassEq(RC)) { + llvm_unreachable("TODO: Implement spill DMRp regclass!"); + } else if (PPC::DMRRCRegClass.hasSubClassEq(RC)) { + OpcodeIndex = SOK_DMRSpill; } else { llvm_unreachable("Unknown regclass!"); } diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h index d0db90b029f61..a27b5718ec89d 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h @@ -81,6 +81,7 @@ enum SpillOpcodeKey { SOK_AccumulatorSpill, SOK_UAccumulatorSpill, SOK_WAccumulatorSpill, + SOK_DMRSpill, SOK_SPESpill, SOK_PairedG8Spill, SOK_LastOpcodeSpill // This must be last on the enum. 
@@ -117,6 +118,7 @@ enum PPCMachineCombinerPattern : unsigned { NoInstr, \ NoInstr, \ NoInstr, \ + NoInstr, \ PPC::EVLDD, \ PPC::RESTORE_QUADWORD} @@ -137,6 +139,7 @@ enum PPCMachineCombinerPattern : unsigned { NoInstr, \ NoInstr, \ NoInstr, \ + NoInstr, \ PPC::RESTORE_QUADWORD} #define Pwr10LoadOpcodes \ @@ -156,6 +159,7 @@ enum PPCMachineCombinerPattern : unsigned { PPC::RESTORE_UACC, \ NoInstr, \ NoInstr, \ + NoInstr, \ PPC::RESTORE_QUADWORD} #define FutureLoadOpcodes \ @@ -174,6 +178,7 @@ enum PPCMachineCombinerPattern : unsigned { PPC::RESTORE_ACC, \ PPC::RESTORE_UACC, \ PPC::RESTORE_WACC, \ + PPC::RESTORE_DMR, \ NoInstr, \ PPC::RESTORE_QUADWORD} @@ -193,6 +198,7 @@ enum PPCMachineCombinerPattern : unsigned { NoInstr, \ NoInstr, \ NoInstr, \ + NoInstr, \ PPC::EVSTDD, \ PPC::SPILL_QUADWORD} @@ -213,6 +219,7 @@ enum PPCMachineCombinerPattern : unsigned { NoInstr, \ NoInstr, \ NoInstr, \ + NoInstr, \ PPC::SPILL_QUADWORD} #define Pwr10StoreOpcodes \ @@ -232,6 +239,7 @@ enum PPCMachineCombinerPattern : unsigned { PPC::SPILL_UACC, \ NoInstr, \ NoInstr, \ + NoInstr, \ PPC::SPILL_QUADWORD} #define FutureStoreOpcodes \ @@ -250,6 +258,7 @@ enum PPCMachineCombinerPattern : unsigned { PPC::SPILL_ACC, \ PPC::SPILL_UACC, \ PPC::SPILL_WACC, \ + PPC::SPILL_DMR, \ NoInstr, \ PPC::SPILL_QUADWORD} diff --git a/llvm/lib/Target/PowerPC/PPCInstrMMA.td b/llvm/lib/Target/PowerPC/PPCInstrMMA.td index 85a7c8e04c74c..82e4a60e0a728 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrMMA.td +++ b/llvm/lib/Target/PowerPC/PPCInstrMMA.td @@ -565,10 +565,14 @@ let Predicates = [MMA, IsISAFuture], isCodeGenOnly = 1 in { let mayStore = 1 in { def SPILL_WACC: PPCEmitTimePseudo<(outs), (ins wacc:$AT, memrix16:$dst), "#SPILL_WACC", []>; + def SPILL_DMR: PPCEmitTimePseudo<(outs), (ins dmr:$AT, memrix16:$dst), + "#SPILL_DMR", []>; } let mayLoad = 1, hasSideEffects = 0 in { def RESTORE_WACC: PPCEmitTimePseudo<(outs wacc:$AT), (ins memrix16:$src), "#RESTORE_WACC", []>; + def RESTORE_DMR: 
PPCEmitTimePseudo<(outs dmr:$AT), (ins memrix16:$src),
+                                       "#RESTORE_DMR", []>;
   }
 }
 
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 51902ad218d1c..45183af0b7984 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -1509,6 +1509,116 @@ void PPCRegisterInfo::lowerQuadwordRestore(MachineBasicBlock::iterator II,
   MBB.erase(II);
 }
 
+/// lowerDMRSpilling - Expand the SPILL_DMR pseudo into real instructions.
+///
+/// Each half of the source DMR (sub_wacc_hi and sub_wacc_lo) is extracted into
+/// two VSR pairs, and the four pairs are stored with STXVP at byte offsets 0,
+/// 32, 64 and 96 of the spill slot. The offset given to each pair is mirrored
+/// between little- and big-endian targets; lowerDMRRestore reloads the pairs
+/// from the same offsets.
+///
+/// \param II Iterator at the SPILL_DMR pseudo, which is erased on return.
+/// \param FrameIndex Frame index of the slot the DMR is spilled to.
+void PPCRegisterInfo::lowerDMRSpilling(MachineBasicBlock::iterator II,
+                                       unsigned FrameIndex) const {
+  MachineInstr &MI = *II; // SPILL_DMR <SrcReg>, <offset>
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+  bool IsLittleEndian = Subtarget.isLittleEndian();
+
+  // A DMR is made up of a WACC_HI and a WACC half, so use the matching
+  // DMXXEXTFDMR512 variant to extract the 512 bits of each half.
+  const TargetRegisterClass *RC = &PPC::VSRpRCRegClass;
+  Register SrcReg = MI.getOperand(0).getReg();
+
+  Register VSRpReg0 = MF.getRegInfo().createVirtualRegister(RC);
+  Register VSRpReg1 = MF.getRegInfo().createVirtualRegister(RC);
+  Register VSRpReg2 = MF.getRegInfo().createVirtualRegister(RC);
+  Register VSRpReg3 = MF.getRegInfo().createVirtualRegister(RC);
+
+  BuildMI(MBB, II, DL, TII.get(PPC::DMXXEXTFDMR512_HI), VSRpReg2)
+      .addDef(VSRpReg3)
+      .addReg(TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_wacc_hi));
+
+  BuildMI(MBB, II, DL, TII.get(PPC::DMXXEXTFDMR512), VSRpReg0)
+      .addDef(VSRpReg1)
+      .addReg(TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_wacc_lo));
+
+  // Store the four pairs; the slot offsets run in opposite directions on
+  // little- and big-endian targets.
+  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
+                        .addReg(VSRpReg0, RegState::Kill),
+                    FrameIndex, IsLittleEndian ? 96 : 0);
+  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
+                        .addReg(VSRpReg1, RegState::Kill),
+                    FrameIndex, IsLittleEndian ? 64 : 32);
+  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
+                        .addReg(VSRpReg2, RegState::Kill),
+                    FrameIndex, IsLittleEndian ? 32 : 64);
+  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
+                        .addReg(VSRpReg3, RegState::Kill),
+                    FrameIndex, IsLittleEndian ? 0 : 96);
+
+  // Discard the pseudo instruction.
+  MBB.erase(II);
+}
+
+/// lowerDMRRestore - Expand the RESTORE_DMR pseudo into real instructions.
+///
+/// Four VSR pairs are reloaded with LXVP from byte offsets 0, 32, 64 and 96 of
+/// the spill slot, using the same endian-dependent offsets lowerDMRSpilling
+/// stores to, and are then inserted into the sub_wacc_hi and sub_wacc_lo
+/// halves of the destination DMR.
+///
+/// \param II Iterator at the RESTORE_DMR pseudo, which is erased on return.
+/// \param FrameIndex Frame index of the slot the DMR is reloaded from.
+void PPCRegisterInfo::lowerDMRRestore(MachineBasicBlock::iterator II,
+                                      unsigned FrameIndex) const {
+  MachineInstr &MI = *II; // <DestReg> = RESTORE_DMR <offset>
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+  bool IsLittleEndian = Subtarget.isLittleEndian();
+
+  const TargetRegisterClass *RC = &PPC::VSRpRCRegClass;
+  Register DestReg = MI.getOperand(0).getReg();
+
+  Register VSRpReg0 = MF.getRegInfo().createVirtualRegister(RC);
+  Register VSRpReg1 = MF.getRegInfo().createVirtualRegister(RC);
+  Register VSRpReg2 = MF.getRegInfo().createVirtualRegister(RC);
+  Register VSRpReg3 = MF.getRegInfo().createVirtualRegister(RC);
+
+  // Reload the four pairs from the offsets lowerDMRSpilling stored them to.
+  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::LXVP), VSRpReg0),
+                    FrameIndex, IsLittleEndian ? 96 : 0);
+  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::LXVP), VSRpReg1),
+                    FrameIndex, IsLittleEndian ? 64 : 32);
+  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::LXVP), VSRpReg2),
+                    FrameIndex, IsLittleEndian ? 32 : 64);
+  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::LXVP), VSRpReg3),
+                    FrameIndex, IsLittleEndian ? 0 : 96);
+
+  // Rebuild each DMR half from its two pairs. These are the only uses of the
+  // temporaries, so they are marked killed.
+  BuildMI(MBB, II, DL, TII.get(PPC::DMXXINSTDMR512_HI),
+          TargetRegisterInfo::getSubReg(DestReg, PPC::sub_wacc_hi))
+      .addReg(VSRpReg2, RegState::Kill)
+      .addReg(VSRpReg3, RegState::Kill);
+
+  BuildMI(MBB, II, DL, TII.get(PPC::DMXXINSTDMR512),
+          TargetRegisterInfo::getSubReg(DestReg, PPC::sub_wacc_lo))
+      .addReg(VSRpReg0, RegState::Kill)
+      .addReg(VSRpReg1, RegState::Kill);
+
+  // Discard the pseudo instruction.
+  MBB.erase(II);
+}
+
 bool PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF,
                                            Register Reg, int &FrameIdx) const {
   // For the nonvolatile condition registers (CR2, CR3, CR4) return true to
@@ -1671,6 +1781,12 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   case PPC::RESTORE_WACC:
     lowerWACCRestore(II, FrameIndex);
     return true;
+  case PPC::SPILL_DMR:
+    lowerDMRSpilling(II, FrameIndex);
+    return true;
+  case PPC::RESTORE_DMR:
+    lowerDMRRestore(II, FrameIndex);
+    return true;
   case PPC::SPILL_QUADWORD:
     lowerQuadwordSpilling(II, FrameIndex);
     return true;
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
index 005d890c57c93..4b66ece534112 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -151,6 +151,11 @@ class PPCRegisterInfo : public PPCGenRegisterInfo {
   void lowerQuadwordRestore(MachineBasicBlock::iterator II,
                             unsigned FrameIndex) const;
 
+  void lowerDMRSpilling(MachineBasicBlock::iterator II,
+                        unsigned FrameIndex) const;
+  void lowerDMRRestore(MachineBasicBlock::iterator II,
+                       unsigned FrameIndex) const;
+
   static void emitAccCopyInfo(MachineBasicBlock &MBB, MCRegister DestReg,
                               MCRegister SrcReg);
 
diff --git a/llvm/test/CodeGen/PowerPC/dmr-spill.ll b/llvm/test/CodeGen/PowerPC/dmr-spill.ll
new file mode 100644
index 0000000000000..b224643a6dd9f
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/dmr-spill.ll
@@ -0,0 +1,151 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+;
RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -disable-auto-paired-vec-st=false -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mcpu=future < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-aix \ +; RUN: -disable-auto-paired-vec-st=false -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mcpu=future < %s | FileCheck %s --check-prefix=AIX +; RUN: llc -verify-machineinstrs -mtriple=powerpc-unknown-aix \ +; RUN: -disable-auto-paired-vec-st=false -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mcpu=future < %s | FileCheck %s --check-prefix=AIX32 + +declare <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2pp(<1024 x i1>, <256 x i1>, <16 x i8>) +declare void @dummy_func() + +define void @spillDMRreg(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) nounwind { +; CHECK-LABEL: spillDMRreg: +; CHECK: # %bb.0: +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r0, 16(r1) +; CHECK-NEXT: stdu r1, -176(r1) +; CHECK-NEXT: lxvp vsp34, 0(r3) +; CHECK-NEXT: lxvp vsp36, 32(r3) +; CHECK-NEXT: mr r30, r6 +; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1 +; CHECK-NEXT: lxvp vsp34, 64(r3) +; CHECK-NEXT: lxvp vsp36, 96(r3) +; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-NEXT: lxv v2, 16(r4) +; CHECK-NEXT: lxv v3, 0(r4) +; CHECK-NEXT: lxv vs0, 0(r5) +; CHECK-NEXT: dmxvbf16gerx2pp dmr0, vsp34, vs0 +; CHECK-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc_hi0, 1 +; CHECK-NEXT: dmxxextfdmr512 vsp38, vsp32, wacc0, 0 +; CHECK-NEXT: stxvp vsp38, 128(r1) +; CHECK-NEXT: stxvp vsp32, 96(r1) +; CHECK-NEXT: stxvp vsp36, 64(r1) +; CHECK-NEXT: stxvp vsp34, 32(r1) +; CHECK-NEXT: bl dummy_func@notoc +; CHECK-NEXT: lxvp vsp34, 128(r1) +; CHECK-NEXT: lxvp vsp36, 96(r1) +; CHECK-NEXT: lxvp vsp32, 64(r1) +; CHECK-NEXT: lxvp vsp38, 32(r1) +; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp32, vsp38, 1 +; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, 
wacc0, 0 +; CHECK-NEXT: stxvp vsp34, 96(r30) +; CHECK-NEXT: stxvp vsp36, 64(r30) +; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1 +; CHECK-NEXT: stxvp vsp34, 32(r30) +; CHECK-NEXT: stxvp vsp36, 0(r30) +; CHECK-NEXT: addi r1, r1, 176 +; CHECK-NEXT: ld r0, 16(r1) +; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr +; +; AIX-LABEL: spillDMRreg: +; AIX: # %bb.0: +; AIX-NEXT: mflr r0 +; AIX-NEXT: std r0, 16(r1) +; AIX-NEXT: stdu r1, -256(r1) +; AIX-NEXT: std r31, 248(r1) # 8-byte Folded Spill +; AIX-NEXT: lxvp vsp34, 96(r3) +; AIX-NEXT: lxvp vsp36, 64(r3) +; AIX-NEXT: mr r31, r6 +; AIX-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1 +; AIX-NEXT: lxvp vsp34, 32(r3) +; AIX-NEXT: lxvp vsp36, 0(r3) +; AIX-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; AIX-NEXT: lxv v2, 0(r4) +; AIX-NEXT: lxv v3, 16(r4) +; AIX-NEXT: lxv vs0, 0(r5) +; AIX-NEXT: dmxvbf16gerx2pp dmr0, vsp34, vs0 +; AIX-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc_hi0, 1 +; AIX-NEXT: dmxxextfdmr512 vsp38, vsp32, wacc0, 0 +; AIX-NEXT: stxvp vsp38, 112(r1) +; AIX-NEXT: stxvp vsp32, 144(r1) +; AIX-NEXT: stxvp vsp36, 176(r1) +; AIX-NEXT: stxvp vsp34, 208(r1) +; AIX-NEXT: bl .dummy_func[PR] +; AIX-NEXT: nop +; AIX-NEXT: lxvp vsp34, 112(r1) +; AIX-NEXT: lxvp vsp36, 144(r1) +; AIX-NEXT: lxvp vsp32, 176(r1) +; AIX-NEXT: lxvp vsp38, 208(r1) +; AIX-NEXT: dmxxinstdmr512 wacc_hi0, vsp32, vsp38, 1 +; AIX-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; AIX-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1 +; AIX-NEXT: stxvp vsp36, 96(r31) +; AIX-NEXT: stxvp vsp34, 64(r31) +; AIX-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; AIX-NEXT: stxvp vsp36, 32(r31) +; AIX-NEXT: stxvp vsp34, 0(r31) +; AIX-NEXT: ld r31, 248(r1) # 8-byte Folded Reload +; AIX-NEXT: addi r1, r1, 256 +; AIX-NEXT: ld r0, 16(r1) +; AIX-NEXT: mtlr r0 +; AIX-NEXT: blr +; +; AIX32-LABEL: spillDMRreg: +; AIX32: # %bb.0: +; AIX32-NEXT: mflr r0 +; AIX32-NEXT: stw r0, 8(r1) +; AIX32-NEXT: stwu r1, -208(r1) +; AIX32-NEXT: stw 
r31, 204(r1) # 4-byte Folded Spill +; AIX32-NEXT: lxvp vsp34, 96(r3) +; AIX32-NEXT: lxvp vsp36, 64(r3) +; AIX32-NEXT: mr r31, r6 +; AIX32-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1 +; AIX32-NEXT: lxvp vsp34, 32(r3) +; AIX32-NEXT: lxvp vsp36, 0(r3) +; AIX32-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; AIX32-NEXT: lxv v2, 0(r4) +; AIX32-NEXT: lxv v3, 16(r4) +; AIX32-NEXT: lxv vs0, 0(r5) +; AIX32-NEXT: dmxvbf16gerx2pp dmr0, vsp34, vs0 +; AIX32-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc_hi0, 1 +; AIX32-NEXT: dmxxextfdmr512 vsp38, vsp32, wacc0, 0 +; AIX32-NEXT: stxvp vsp38, 64(r1) +; AIX32-NEXT: stxvp vsp32, 96(r1) +; AIX32-NEXT: stxvp vsp36, 128(r1) +; AIX32-NEXT: stxvp vsp34, 160(r1) +; AIX32-NEXT: bl .dummy_func[PR] +; AIX32-NEXT: nop +; AIX32-NEXT: lxvp vsp34, 64(r1) +; AIX32-NEXT: lxvp vsp36, 96(r1) +; AIX32-NEXT: lxvp vsp32, 128(r1) +; AIX32-NEXT: lxvp vsp38, 160(r1) +; AIX32-NEXT: dmxxinstdmr512 wacc_hi0, vsp32, vsp38, 1 +; AIX32-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; AIX32-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1 +; AIX32-NEXT: stxvp vsp36, 96(r31) +; AIX32-NEXT: stxvp vsp34, 64(r31) +; AIX32-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; AIX32-NEXT: stxvp vsp36, 32(r31) +; AIX32-NEXT: stxvp vsp34, 0(r31) +; AIX32-NEXT: lwz r31, 204(r1) # 4-byte Folded Reload +; AIX32-NEXT: addi r1, r1, 208 +; AIX32-NEXT: lwz r0, 8(r1) +; AIX32-NEXT: mtlr r0 +; AIX32-NEXT: blr + %v.dmr = load <1024 x i1>, ptr %vop, align 64 + %v1 = load <256 x i1>, ptr %vpp, align 32 + %v2 = load <16 x i8>, ptr %vcp, align 32 + %call = tail call <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2pp(<1024 x i1> %v.dmr, <256 x i1> %v1, <16 x i8> %v2) + tail call void @dummy_func() + %call2 = tail call <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2pp(<1024 x i1> %v.dmr, <256 x i1> %v1, <16 x i8> %v2) + store <1024 x i1> %call, ptr %resp, align 64 + ret void +}