diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
index e4d39134a4a25..a1f1a1707013f 100644
--- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
@@ -1661,6 +1661,31 @@ let TargetPrefix = "ppc" in {
       DefaultAttrsIntrinsic<[llvm_v1024i1_ty], [llvm_v1024i1_ty,
                              llvm_v1024i1_ty], [IntrNoMem]>;
 
+  // DMR <-> 256-bit value transfers. In each intrinsic the trailing i32
+  // selects the part of the 1024-bit DMR being accessed; lowering requires it
+  // to be a constant, so it is marked ImmArg and checked by the IR verifier.
+
+  // Extract a 512-bit half (selector 0 or 1) as two 256-bit values.
+  def int_ppc_mma_dmxxextfdmr512 :
+      DefaultAttrsIntrinsic<[llvm_v256i1_ty, llvm_v256i1_ty], [llvm_v1024i1_ty,
+                             llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+
+  // Insert two 256-bit values into a 512-bit half (selector 0 or 1).
+  def int_ppc_mma_dmxxinstdmr512 :
+      DefaultAttrsIntrinsic<[llvm_v1024i1_ty], [llvm_v1024i1_ty, llvm_v256i1_ty,
+                             llvm_v256i1_ty, llvm_i32_ty],
+                            [IntrNoMem, ImmArg<ArgIndex<3>>]>;
+
+  // Extract one 256-bit row pair (selector 0-3).
+  def int_ppc_mma_dmxxextfdmr256 :
+      DefaultAttrsIntrinsic<[llvm_v256i1_ty], [llvm_v1024i1_ty, llvm_i32_ty],
+                            [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+
+  // Insert one 256-bit value into a row pair (selector 0-3).
+  def int_ppc_mma_dmxxinstdmr256 :
+      DefaultAttrsIntrinsic<[llvm_v1024i1_ty], [llvm_v1024i1_ty, llvm_v256i1_ty,
+                             llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+
   // MMA Reduced-Precision: Outer Product Intrinsic Definitions.
   defm int_ppc_mma_xvi4ger8 :
         PowerPC_MMA_ACC_PP_Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty]>;
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
index d6744014949ce..a088096c92a68 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
@@ -294,6 +294,11 @@ static inline bool isVFRegister(unsigned Reg) {
 static inline bool isVRRegister(unsigned Reg) {
   return Reg >= PPC::V0 && Reg <= PPC::V31;
 }
+
+// Returns true if Reg lies in the DMRROWp0..DMRROWp31 register range.
+static inline bool isDMRROWpRegister(unsigned Reg) {
+  return Reg >= PPC::DMRROWp0 && Reg <= PPC::DMRROWp31;
+}
 } // namespace PPC
 } // namespace llvm
 
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 1f75425752a78..0800ed5dfce2c 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -11146,6 +11146,128 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     return DAG.getMergeValues(RetOps, dl);
   }
 
+  case Intrinsic::ppc_mma_dmxxextfdmr512: {
+    // Extract the 512-bit half of the DMR (operand 1) chosen by operand 2
+    // (0 -> sub_wacc_lo, 1 -> sub_wacc_hi) as two 256-bit values.
+    assert(Subtarget.isISAFuture() && "dmxxextfdmr512 requires ISA Future");
+    auto *Idx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+    assert(Idx && (Idx->getSExtValue() == 0 || Idx->getSExtValue() == 1) &&
+           "Specify P of 0 or 1 for lower or upper 512 bits");
+    unsigned HiLo = Idx->getSExtValue();
+    unsigned Opcode;
+    unsigned Subx;
+    if (HiLo == 0) {
+      Opcode = PPC::DMXXEXTFDMR512;
+      Subx = PPC::sub_wacc_lo;
+    } else {
+      Opcode = PPC::DMXXEXTFDMR512_HI;
+      Subx = PPC::sub_wacc_hi;
+    }
+    SDValue Subreg(
+        DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, MVT::v512i1,
+                           Op.getOperand(1),
+                           DAG.getTargetConstant(Subx, dl, MVT::i32)),
+        0);
+    EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1};
+    return SDValue(DAG.getMachineNode(Opcode, dl, ReturnTypes, Subreg), 0);
+  }
+
+  case Intrinsic::ppc_mma_dmxxextfdmr256: {
+    // Extract the 256-bit row pair of the DMR (operand 1) chosen by operand 2.
+    // Row pairs 2 and 3 are reached through sub_wacc_hi.
+    assert(Subtarget.isISAFuture() && "dmxxextfdmr256 requires ISA Future");
+    auto *Idx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+    assert(Idx && (Idx->getSExtValue() >= 0 && Idx->getSExtValue() <= 3) &&
+           "Specify a dmr row pair 0-3");
+    unsigned IdxVal = Idx->getSExtValue();
+    unsigned Subx;
+    switch (IdxVal) {
+    case 0:
+      Subx = PPC::sub_dmrrowp0;
+      break;
+    case 1:
+      Subx = PPC::sub_dmrrowp1;
+      break;
+    case 2:
+      Subx = PPC::sub_wacc_hi_then_sub_dmrrowp0;
+      break;
+    case 3:
+      Subx = PPC::sub_wacc_hi_then_sub_dmrrowp1;
+      break;
+    default:
+      llvm_unreachable("Specify a dmr row pair 0-3");
+    }
+    SDValue Subreg(
+        DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, MVT::v256i1,
+                           Op.getOperand(1),
+                           DAG.getTargetConstant(Subx, dl, MVT::i32)),
+        0);
+    SDValue P = DAG.getTargetConstant(IdxVal, dl, MVT::i32);
+    return SDValue(
+        DAG.getMachineNode(PPC::DMXXEXTFDMR256, dl, MVT::v256i1, {Subreg, P}),
+        0);
+  }
+
+  case Intrinsic::ppc_mma_dmxxinstdmr512: {
+    // Combine the 256-bit operands 2 and 3 into a 512-bit value and insert it
+    // into the half of the DMR (operand 1) chosen by operand 4.
+    assert(Subtarget.isISAFuture() && "dmxxinstdmr512 requires ISA Future");
+    auto *Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4));
+    assert(Idx && (Idx->getSExtValue() == 0 || Idx->getSExtValue() == 1) &&
+           "Specify P of 0 or 1 for lower or upper 512 bits");
+    unsigned HiLo = Idx->getSExtValue();
+    unsigned Opcode;
+    unsigned Subx;
+    if (HiLo == 0) {
+      Opcode = PPC::DMXXINSTDMR512;
+      Subx = PPC::sub_wacc_lo;
+    } else {
+      Opcode = PPC::DMXXINSTDMR512_HI;
+      Subx = PPC::sub_wacc_hi;
+    }
+    SDValue Ops[] = {Op.getOperand(2), Op.getOperand(3)};
+    SDValue Wacc = SDValue(DAG.getMachineNode(Opcode, dl, MVT::v512i1, Ops), 0);
+    SDValue SubReg = DAG.getTargetConstant(Subx, dl, MVT::i32);
+    return SDValue(DAG.getMachineNode(PPC::INSERT_SUBREG, dl, MVT::v1024i1,
+                                      Op.getOperand(1), Wacc, SubReg),
+                   0);
+  }
+
+  case Intrinsic::ppc_mma_dmxxinstdmr256: {
+    // Insert the 256-bit operand 2 into the row pair of the DMR (operand 1)
+    // chosen by operand 3.
+    assert(Subtarget.isISAFuture() && "dmxxinstdmr256 requires ISA Future");
+    auto *Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3));
+    assert(Idx && (Idx->getSExtValue() >= 0 && Idx->getSExtValue() <= 3) &&
+           "Specify a dmr row pair 0-3");
+    unsigned IdxVal = Idx->getSExtValue();
+    unsigned Subx;
+    switch (IdxVal) {
+    case 0:
+      Subx = PPC::sub_dmrrowp0;
+      break;
+    case 1:
+      Subx = PPC::sub_dmrrowp1;
+      break;
+    case 2:
+      Subx = PPC::sub_wacc_hi_then_sub_dmrrowp0;
+      break;
+    case 3:
+      Subx = PPC::sub_wacc_hi_then_sub_dmrrowp1;
+      break;
+    default:
+      llvm_unreachable("Specify a dmr row pair 0-3");
+    }
+    SDValue SubReg = DAG.getTargetConstant(Subx, dl, MVT::i32);
+    SDValue P = DAG.getTargetConstant(IdxVal, dl, MVT::i32);
+    SDValue Ops[] = {Op.getOperand(2), P};
+    SDValue DMRRowp = SDValue(
+        DAG.getMachineNode(PPC::DMXXINSTDMR256, dl, MVT::v256i1, Ops), 0);
+    return SDValue(DAG.getMachineNode(PPC::INSERT_SUBREG, dl, MVT::v1024i1,
+                                      Op.getOperand(1), DMRRowp, SubReg),
+                   0);
+  }
+
   case Intrinsic::ppc_mma_xxmfacc:
   case Intrinsic::ppc_mma_xxmtacc: {
     // Allow pre-isa-future subtargets to lower as normal.
diff --git a/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp b/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
index 13cee8dd783bb..0a04b7fb8d169 100644
--- a/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
+++ b/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
@@ -196,6 +196,15 @@ bool llvm::LowerPPCMachineOperandToMCOperand(const MachineOperand &MO,
     assert(MO.getReg() > PPC::NoRegister &&
            MO.getReg() < PPC::NUM_TARGET_REGS &&
            "Invalid register for this target!");
+    // ISA instructions refer to the containing dmr reg. The row-pair number
+    // divided by 4 is used as the number of the containing DMR.
+    // NOTE(review): this runs before the implicit-operand filter below, so
+    // implicit row-pair operands are lowered too -- confirm that is intended.
+    if (PPC::isDMRROWpRegister(MO.getReg())) {
+      OutMO =
+          MCOperand::createReg(PPC::DMR0 + (MO.getReg() - PPC::DMRROWp0) / 4);
+      return true;
+    }
     // Ignore all implicit register operands.
     if (MO.isImplicit())
       return false;
diff --git a/llvm/test/CodeGen/PowerPC/dmr-enable.ll b/llvm/test/CodeGen/PowerPC/dmr-enable.ll
index a6c99a751e2c5..1e3014405ac4e 100644
--- a/llvm/test/CodeGen/PowerPC/dmr-enable.ll
+++ b/llvm/test/CodeGen/PowerPC/dmr-enable.ll
@@ -129,6 +129,259 @@ entry:
   ret void
 }
 
+; Extract both 512-bit halves (selector 0 and 1) of a zeroed DMR and store
+; element 0 of each returned pair.
+define void @text512(ptr %vp1, ptr %rp1, ptr %rp2, ptr %rp3, ptr %rp4) {
+; CHECK-LABEL: text512:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    dmsetdmrz dmr0
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT:    stxv v2, 16(r4)
+; CHECK-NEXT:    stxv v3, 0(r4)
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT:    stxv v2, 16(r6)
+; CHECK-NEXT:    stxv v3, 0(r6)
+; CHECK-NEXT:    blr
+;
+; CHECK-BE-LABEL: text512:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    dmsetdmrz dmr0
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT:    stxv v3, 16(r4)
+; CHECK-BE-NEXT:    stxv v2, 0(r4)
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT:    stxv v3, 16(r6)
+; CHECK-BE-NEXT:    stxv v2, 0(r6)
+; CHECK-BE-NEXT:    blr
+entry:
+  %z = call <1024 x i1> @llvm.ppc.mma.dmsetdmrz()
+  %x = call { <256 x i1>, <256 x i1> } @llvm.ppc.mma.dmxxextfdmr512(<1024 x i1> %z, i32 0)
+  %p = extractvalue { <256 x i1>, <256 x i1 > } %x, 0
+  store <256 x i1> %p, ptr %rp1, align 16
+  %y = call { <256 x i1>, <256 x i1> } @llvm.ppc.mma.dmxxextfdmr512(<1024 x i1> %z, i32 1)
+  %q = extractvalue { <256 x i1>, <256 x i1 > } %y, 0
+  store <256 x i1> %q, ptr %rp3, align 16
+  ret void
+}
+
+; Extract each of the four 256-bit row pairs (selector 0-3) of a zeroed DMR
+; and store it.
+define void @text256(ptr %vp1, ptr %rp1, ptr %rp2, ptr %rp3, ptr %rp4) {
+; CHECK-LABEL: text256:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    dmsetdmrz dmr0
+; CHECK-NEXT:    dmxxextfdmr256 vsp34, dmr0, 0
+; CHECK-NEXT:    stxv v2, 16(r4)
+; CHECK-NEXT:    stxv v3, 0(r4)
+; CHECK-NEXT:    dmxxextfdmr256 vsp34, dmr0, 1
+; CHECK-NEXT:    stxv v2, 16(r5)
+; CHECK-NEXT:    stxv v3, 0(r5)
+; CHECK-NEXT:    dmxxextfdmr256 vsp34, dmr0, 2
+; CHECK-NEXT:    stxv v2, 16(r6)
+; CHECK-NEXT:    stxv v3, 0(r6)
+; CHECK-NEXT:    dmxxextfdmr256 vsp34, dmr0, 3
+; CHECK-NEXT:    stxv v2, 16(r7)
+; CHECK-NEXT:    stxv v3, 0(r7)
+; CHECK-NEXT:    blr
+;
+; CHECK-BE-LABEL: text256:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    dmsetdmrz dmr0
+; CHECK-BE-NEXT:    dmxxextfdmr256 vsp34, dmr0, 0
+; CHECK-BE-NEXT:    stxv v3, 16(r4)
+; CHECK-BE-NEXT:    stxv v2, 0(r4)
+; CHECK-BE-NEXT:    dmxxextfdmr256 vsp34, dmr0, 1
+; CHECK-BE-NEXT:    stxv v3, 16(r5)
+; CHECK-BE-NEXT:    stxv v2, 0(r5)
+; CHECK-BE-NEXT:    dmxxextfdmr256 vsp34, dmr0, 2
+; CHECK-BE-NEXT:    stxv v3, 16(r6)
+; CHECK-BE-NEXT:    stxv v2, 0(r6)
+; CHECK-BE-NEXT:    dmxxextfdmr256 vsp34, dmr0, 3
+; CHECK-BE-NEXT:    stxv v3, 16(r7)
+; CHECK-BE-NEXT:    stxv v2, 0(r7)
+; CHECK-BE-NEXT:    blr
+entry:
+  %z = call <1024 x i1> @llvm.ppc.mma.dmsetdmrz()
+  %x = call <256 x i1> @llvm.ppc.mma.dmxxextfdmr256(<1024 x i1> %z, i32 0)
+  store <256 x i1> %x, ptr %rp1, align 16
+  %q = call <256 x i1> @llvm.ppc.mma.dmxxextfdmr256(<1024 x i1> %z, i32 1)
+  store <256 x i1> %q, ptr %rp2, align 16
+  %w = call <256 x i1> @llvm.ppc.mma.dmxxextfdmr256(<1024 x i1> %z, i32 2)
+  store <256 x i1> %w, ptr %rp3, align 16
+  %y = call <256 x i1> @llvm.ppc.mma.dmxxextfdmr256(<1024 x i1> %z, i32 3)
+  store <256 x i1> %y, ptr %rp4, align 16
+  ret void
+}
+
+; Insert loaded 256-bit pairs into the low half (selector 0) and then the high
+; half (selector 1) of a DMR, storing the full 1024-bit result after each.
+define void @tins512(ptr %vp1, ptr %vp2, ptr %vp3, ptr %vp4, ptr %rp1, ptr %rp2) {
+; CHECK-LABEL: tins512:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lxv v2, 16(r3)
+; CHECK-NEXT:    lxv v3, 0(r3)
+; CHECK-NEXT:    lxv v4, 16(r4)
+; CHECK-NEXT:    lxv v5, 0(r4)
+; CHECK-NEXT:    dmsetdmrz dmr0
+; CHECK-NEXT:    dmxxinstdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT:    stxvp vsp34, 96(r7)
+; CHECK-NEXT:    stxvp vsp36, 64(r7)
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT:    stxvp vsp34, 32(r7)
+; CHECK-NEXT:    stxvp vsp36, 0(r7)
+; CHECK-NEXT:    lxv v2, 16(r5)
+; CHECK-NEXT:    lxv v4, 16(r6)
+; CHECK-NEXT:    lxv v3, 0(r5)
+; CHECK-NEXT:    lxv v5, 0(r6)
+; CHECK-NEXT:    dmxxinstdmr512 wacc_hi0, vsp34, vsp36, 1
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT:    stxvp vsp34, 96(r8)
+; CHECK-NEXT:    stxvp vsp36, 64(r8)
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT:    stxvp vsp34, 32(r8)
+; CHECK-NEXT:    stxvp vsp36, 0(r8)
+; CHECK-NEXT:    blr
+;
+; CHECK-BE-LABEL: tins512:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv v2, 0(r3)
+; CHECK-BE-NEXT:    lxv v3, 16(r3)
+; CHECK-BE-NEXT:    lxv v4, 0(r4)
+; CHECK-BE-NEXT:    lxv v5, 16(r4)
+; CHECK-BE-NEXT:    dmsetdmrz dmr0
+; CHECK-BE-NEXT:    dmxxinstdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT:    stxvp vsp36, 96(r7)
+; CHECK-BE-NEXT:    stxvp vsp34, 64(r7)
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT:    stxvp vsp36, 32(r7)
+; CHECK-BE-NEXT:    stxvp vsp34, 0(r7)
+; CHECK-BE-NEXT:    lxv v2, 0(r5)
+; CHECK-BE-NEXT:    lxv v4, 0(r6)
+; CHECK-BE-NEXT:    lxv v3, 16(r5)
+; CHECK-BE-NEXT:    lxv v5, 16(r6)
+; CHECK-BE-NEXT:    dmxxinstdmr512 wacc_hi0, vsp34, vsp36, 1
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT:    stxvp vsp36, 96(r8)
+; CHECK-BE-NEXT:    stxvp vsp34, 64(r8)
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT:    stxvp vsp36, 32(r8)
+; CHECK-BE-NEXT:    stxvp vsp34, 0(r8)
+; CHECK-BE-NEXT:    blr
+entry:
+  %z = call <1024 x i1> @llvm.ppc.mma.dmsetdmrz()
+  %l1 = load <256 x i1>, ptr %vp1, align 16
+  %r1 = load <256 x i1>, ptr %vp2, align 16
+  %a = call <1024 x i1> @llvm.ppc.mma.dmxxinstdmr512(<1024 x i1> %z, <256 x i1> %l1, <256 x i1> %r1, i32 0)
+  store <1024 x i1> %a, ptr %rp1, align 16
+  %l2 = load <256 x i1>, ptr %vp3, align 16
+  %r2 = load <256 x i1>, ptr %vp4, align 16
+  %b = call <1024 x i1> @llvm.ppc.mma.dmxxinstdmr512(<1024 x i1> %a, <256 x i1> %l2, <256 x i1> %r2, i32 1)
+  store <1024 x i1> %b, ptr %rp2, align 16
+  ret void
+}
+
+; Insert a 256-bit value into each row pair (selector 0-3) in turn, storing
+; the full 1024-bit result after each insert.
+define void @tins256(ptr %vp1, ptr %vp2, ptr %vp3, ptr %vp4, ptr %rp1, ptr %rp2, ptr %rp3, ptr %rp4) {
+; CHECK-LABEL: tins256:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lxv v2, 16(r3)
+; CHECK-NEXT:    lxv v3, 0(r3)
+; CHECK-NEXT:    dmsetdmrz dmr0
+; CHECK-NEXT:    dmxxinstdmr256 dmr0, vsp34, 0
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT:    stxvp vsp34, 96(r7)
+; CHECK-NEXT:    stxvp vsp36, 64(r7)
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT:    stxvp vsp34, 32(r7)
+; CHECK-NEXT:    stxvp vsp36, 0(r7)
+; CHECK-NEXT:    lxv v2, 16(r4)
+; CHECK-NEXT:    lxv v3, 0(r4)
+; CHECK-NEXT:    dmxxinstdmr256 dmr0, vsp34, 1
+; CHECK-NEXT:    dmxxextfdmr512 vsp36, vsp32, wacc0, 0
+; CHECK-NEXT:    stxvp vsp36, 96(r8)
+; CHECK-NEXT:    stxvp vsp32, 64(r8)
+; CHECK-NEXT:    dmxxextfdmr512 vsp36, vsp32, wacc_hi0, 1
+; CHECK-NEXT:    stxvp vsp36, 32(r8)
+; CHECK-NEXT:    stxvp vsp32, 0(r8)
+; CHECK-NEXT:    dmxxinstdmr256 dmr0, vsp34, 2
+; CHECK-NEXT:    dmxxextfdmr512 vsp36, vsp32, wacc0, 0
+; CHECK-NEXT:    stxvp vsp36, 96(r9)
+; CHECK-NEXT:    stxvp vsp32, 64(r9)
+; CHECK-NEXT:    dmxxextfdmr512 vsp36, vsp32, wacc_hi0, 1
+; CHECK-NEXT:    stxvp vsp36, 32(r9)
+; CHECK-NEXT:    stxvp vsp32, 0(r9)
+; CHECK-NEXT:    dmxxinstdmr256 dmr0, vsp34, 3
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT:    stxvp vsp34, 96(r10)
+; CHECK-NEXT:    stxvp vsp36, 64(r10)
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT:    stxvp vsp34, 32(r10)
+; CHECK-NEXT:    stxvp vsp36, 0(r10)
+; CHECK-NEXT:    blr
+;
+; CHECK-BE-LABEL: tins256:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv v2, 0(r3)
+; CHECK-BE-NEXT:    lxv v3, 16(r3)
+; CHECK-BE-NEXT:    dmsetdmrz dmr0
+; CHECK-BE-NEXT:    dmxxinstdmr256 dmr0, vsp34, 0
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT:    stxvp vsp36, 96(r7)
+; CHECK-BE-NEXT:    stxvp vsp34, 64(r7)
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT:    stxvp vsp36, 32(r7)
+; CHECK-BE-NEXT:    stxvp vsp34, 0(r7)
+; CHECK-BE-NEXT:    lxv v2, 0(r4)
+; CHECK-BE-NEXT:    lxv v3, 16(r4)
+; CHECK-BE-NEXT:    dmxxinstdmr256 dmr0, vsp34, 1
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp36, vsp32, wacc_hi0, 1
+; CHECK-BE-NEXT:    stxvp vsp32, 96(r8)
+; CHECK-BE-NEXT:    stxvp vsp36, 64(r8)
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp36, vsp32, wacc0, 0
+; CHECK-BE-NEXT:    stxvp vsp32, 32(r8)
+; CHECK-BE-NEXT:    stxvp vsp36, 0(r8)
+; CHECK-BE-NEXT:    dmxxinstdmr256 dmr0, vsp34, 2
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp36, vsp32, wacc_hi0, 1
+; CHECK-BE-NEXT:    stxvp vsp32, 96(r9)
+; CHECK-BE-NEXT:    stxvp vsp36, 64(r9)
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp36, vsp32, wacc0, 0
+; CHECK-BE-NEXT:    stxvp vsp32, 32(r9)
+; CHECK-BE-NEXT:    stxvp vsp36, 0(r9)
+; CHECK-BE-NEXT:    dmxxinstdmr256 dmr0, vsp34, 3
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT:    stxvp vsp36, 96(r10)
+; CHECK-BE-NEXT:    stxvp vsp34, 64(r10)
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT:    stxvp vsp36, 32(r10)
+; CHECK-BE-NEXT:    stxvp vsp34, 0(r10)
+; CHECK-BE-NEXT:    blr
+entry:
+  %z = call <1024 x i1> @llvm.ppc.mma.dmsetdmrz()
+  %l1 = load <256 x i1>, ptr %vp1, align 16
+  %a = call <1024 x i1> @llvm.ppc.mma.dmxxinstdmr256(<1024 x i1> %z, <256 x i1> %l1, i32 0)
+  store <1024 x i1> %a, ptr %rp1, align 16
+  %l2 = load <256 x i1>, ptr %vp2, align 16
+  %b = call <1024 x i1> @llvm.ppc.mma.dmxxinstdmr256(<1024 x i1> %a, <256 x i1> %l2, i32 1)
+  store <1024 x i1> %b, ptr %rp2, align 16
+; NOTE(review): %l3 and %l4 below are loaded but never used; the selector 2
+; and 3 inserts pass %l2 again. The assertions above match that, so switching
+; to %l3/%l4 needs them regenerated -- confirm which was intended.
+  %l3 = load <256 x i1>, ptr %vp3, align 16
+  %c = call <1024 x i1> @llvm.ppc.mma.dmxxinstdmr256(<1024 x i1> %b, <256 x i1> %l2, i32 2)
+  store <1024 x i1> %c, ptr %rp3, align 16
+  %l4 = load <256 x i1>, ptr %vp4, align 16
+  %d = call <1024 x i1> @llvm.ppc.mma.dmxxinstdmr256(<1024 x i1> %c, <256 x i1> %l2, i32 3)
+  store <1024 x i1> %d, ptr %rp4, align 16
+  ret void
+}
+
 declare <1024 x i1> @llvm.ppc.mma.dmsetdmrz()
 declare <1024 x i1> @llvm.ppc.mma.dmmr(<1024 x i1>)
 declare <1024 x i1> @llvm.ppc.mma.dmxor(<1024 x i1>, <1024 x i1>)
+declare <1024 x i1> @llvm.ppc.mma.dmxxinstdmr512(<1024 x i1>, <256 x i1>, <256 x i1>, i32)
+declare <1024 x i1> @llvm.ppc.mma.dmxxinstdmr256(<1024 x i1>, <256 x i1>, i32)
+declare { <256 x i1>, <256 x i1> } @llvm.ppc.mma.dmxxextfdmr512(<1024 x i1>, i32)
+declare <256 x i1> @llvm.ppc.mma.dmxxextfdmr256(<1024 x i1>, i32)