From b3b84164de8bb9be64787f660e05c1d7768aa04b Mon Sep 17 00:00:00 2001 From: Roland Froese Date: Wed, 12 Feb 2025 20:51:33 +0000 Subject: [PATCH 1/3] custom lower v1024i1 load/store --- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 86 ++++++++++++-- .../test/CodeGen/PowerPC/mmaplus-acc-spill.ll | 8 +- .../CodeGen/PowerPC/mmaplus-intrinsics.ll | 106 +++++++++--------- llvm/test/CodeGen/PowerPC/v1024ls.ll | 65 +++++++++++ 4 files changed, 197 insertions(+), 68 deletions(-) create mode 100644 llvm/test/CodeGen/PowerPC/v1024ls.ll diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index bdc1ac7c7da58..300fa716297bd 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -1363,6 +1363,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::STORE, MVT::v512i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v512i1, Custom); } + if (Subtarget.isISAFuture()) { + setOperationAction(ISD::LOAD, MVT::v1024i1, Custom); + setOperationAction(ISD::STORE, MVT::v1024i1, Custom); + addRegisterClass(MVT::v1024i1, &PPC::DMRRCRegClass); + } if (Subtarget.has64BitSupport()) setOperationAction(ISD::PREFETCH, MVT::Other, Legal); @@ -11766,9 +11771,13 @@ SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op, SDValue BasePtr = LN->getBasePtr(); EVT VT = Op.getValueType(); - if (VT != MVT::v256i1 && VT != MVT::v512i1) + if (VT != MVT::v256i1 && VT != MVT::v512i1 && VT != MVT::v1024i1) return Op; + // Used for dense math registers. + assert((VT != MVT::v1024i1 || Subtarget.isISAFuture()) && + "Type unsupported for this processor"); + // Type v256i1 is used for pairs and v512i1 is used for accumulators. // Here we create 2 or 4 v16i8 loads to load the pair or accumulator value in // 2 or 4 vsx registers. @@ -11796,9 +11805,36 @@ SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op, std::reverse(LoadChains.begin(), LoadChains.end()); } SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); - SDValue Value = - DAG.getNode(VT == MVT::v512i1 ? PPCISD::ACC_BUILD : PPCISD::PAIR_BUILD, - dl, VT, Loads); + SDValue Value; + if (VT == MVT::v1024i1) { + SmallVector Pairs; + SDValue Vsx0Idx = DAG.getTargetConstant(PPC::sub_vsx0, dl, MVT::i32); + SDValue Vsx1Idx = DAG.getTargetConstant(PPC::sub_vsx1, dl, MVT::i32); + SDValue VSRpRC = DAG.getTargetConstant(PPC::VSRpRCRegClassID, dl, MVT::i32); + NumVecs >>= 1; + for (unsigned Idx = 0; Idx < NumVecs; ++Idx) { + const SDValue Ops[] = {VSRpRC, Loads[Idx * 2], Vsx0Idx, + Loads[Idx * 2 + 1], Vsx1Idx}; + Pairs.push_back(SDValue( + DAG.getMachineNode(PPC::REG_SEQUENCE, dl, MVT::v256i1, Ops), 0)); + } + SDValue Lo(DAG.getMachineNode(PPC::DMXXINSTFDMR512, dl, MVT::v512i1, + Pairs[0], Pairs[1]), + 0); + SDValue LoSub = DAG.getTargetConstant(PPC::sub_wacc_lo, dl, MVT::i32); + SDValue Hi(DAG.getMachineNode(PPC::DMXXINSTFDMR512_HI, dl, MVT::v512i1, + Pairs[2], Pairs[3]), + 0); + SDValue HiSub = DAG.getTargetConstant(PPC::sub_wacc_hi, dl, MVT::i32); + SDValue RC = DAG.getTargetConstant(PPC::DMRRCRegClassID, dl, MVT::i32); + const SDValue Ops[] = {RC, Lo, LoSub, Hi, HiSub}; + Value = SDValue( + DAG.getMachineNode(PPC::REG_SEQUENCE, dl, MVT::v1024i1, Ops), 0); + } else { + Value = + DAG.getNode(VT == MVT::v512i1 ? PPCISD::ACC_BUILD : PPCISD::PAIR_BUILD, + dl, VT, Loads); + } SDValue RetOps[] = {Value, TF}; return DAG.getMergeValues(RetOps, dl); } @@ -11810,12 +11846,17 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op, SDValue StoreChain = SN->getChain(); SDValue BasePtr = SN->getBasePtr(); SDValue Value = SN->getValue(); - SDValue Value2 = SN->getValue(); EVT StoreVT = Value.getValueType(); + SmallVector ValueVec; - if (StoreVT != MVT::v256i1 && StoreVT != MVT::v512i1) + if (StoreVT != MVT::v256i1 && StoreVT != MVT::v512i1 && + StoreVT != MVT::v1024i1) return Op; + // Used for dense math registers. + assert((StoreVT != MVT::v1024i1 || Subtarget.isISAFuture()) && + "Type unsupported for this processor"); + // Type v256i1 is used for pairs and v512i1 is used for accumulators. // Here we create 2 or 4 v16i8 stores to store the pair or accumulator // underlying registers individually. @@ -11832,20 +11873,43 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op, MachineSDNode *ExtNode = DAG.getMachineNode( PPC::DMXXEXTFDMR512, dl, ReturnTypes, Op.getOperand(1)); - Value = SDValue(ExtNode, 0); - Value2 = SDValue(ExtNode, 1); + ValueVec.push_back(SDValue(ExtNode, 0)); + ValueVec.push_back(SDValue(ExtNode, 1)); } else Value = DAG.getNode(PPCISD::XXMFACC, dl, MVT::v512i1, Value); NumVecs = 4; + + } else if (StoreVT == MVT::v1024i1) { + SDValue Lo(DAG.getMachineNode( + TargetOpcode::EXTRACT_SUBREG, dl, MVT::v512i1, + Op.getOperand(1), + DAG.getTargetConstant(PPC::sub_wacc_lo, dl, MVT::i32)), + 0); + SDValue Hi(DAG.getMachineNode( + TargetOpcode::EXTRACT_SUBREG, dl, MVT::v512i1, + Op.getOperand(1), + DAG.getTargetConstant(PPC::sub_wacc_hi, dl, MVT::i32)), + 0); + EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1}; + MachineSDNode *ExtNode = + DAG.getMachineNode(PPC::DMXXEXTFDMR512, dl, ReturnTypes, Lo); + ValueVec.push_back(SDValue(ExtNode, 0)); + ValueVec.push_back(SDValue(ExtNode, 1)); + ExtNode = DAG.getMachineNode(PPC::DMXXEXTFDMR512_HI, dl, ReturnTypes, Hi); + ValueVec.push_back(SDValue(ExtNode, 0)); + ValueVec.push_back(SDValue(ExtNode, 1)); + NumVecs = 8; } for (unsigned Idx = 0; Idx < NumVecs; ++Idx) { unsigned VecNum = Subtarget.isLittleEndian() ? NumVecs - 1 - Idx : Idx; SDValue Elt; if (Subtarget.isISAFuture()) { VecNum = Subtarget.isLittleEndian() ? 1 - (Idx % 2) : (Idx % 2); - Elt = DAG.getNode(PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, - Idx > 1 ? Value2 : Value, - DAG.getConstant(VecNum, dl, getPointerTy(DAG.getDataLayout()))); + unsigned Pairx = + Subtarget.isLittleEndian() ? (NumVecs - Idx - 1) / 2 : Idx / 2; + Elt = DAG.getNode( + PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, ValueVec[Pairx], + DAG.getConstant(VecNum, dl, getPointerTy(DAG.getDataLayout()))); } else Elt = DAG.getNode(PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, Value, DAG.getConstant(VecNum, dl, getPointerTy(DAG.getDataLayout()))); diff --git a/llvm/test/CodeGen/PowerPC/mmaplus-acc-spill.ll b/llvm/test/CodeGen/PowerPC/mmaplus-acc-spill.ll index 5ca8c7b02cab4..c8ead89f96d66 100644 --- a/llvm/test/CodeGen/PowerPC/mmaplus-acc-spill.ll +++ b/llvm/test/CodeGen/PowerPC/mmaplus-acc-spill.ll @@ -46,10 +46,10 @@ define void @intrinsics1(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i ; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp34, vsp36, 0 ; CHECK-NEXT: xvf16ger2pp wacc0, v28, v30 ; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 -; CHECK-NEXT: stxv v4, 48(r30) -; CHECK-NEXT: stxv v5, 32(r30) -; CHECK-NEXT: stxv v2, 16(r30) -; CHECK-NEXT: stxv v3, 0(r30) +; CHECK-NEXT: stxv v2, 48(r30) +; CHECK-NEXT: stxv v3, 32(r30) +; CHECK-NEXT: stxv v4, 16(r30) +; CHECK-NEXT: stxv v5, 0(r30) ; CHECK-NEXT: lxv v31, 144(r1) # 16-byte Folded Reload ; CHECK-NEXT: lxv v30, 128(r1) # 16-byte Folded Reload ; CHECK-NEXT: lxv v29, 112(r1) # 16-byte Folded Reload diff --git a/llvm/test/CodeGen/PowerPC/mmaplus-intrinsics.ll b/llvm/test/CodeGen/PowerPC/mmaplus-intrinsics.ll index 158ec7a3427c8..b3e4392b8d0e3 100644 --- a/llvm/test/CodeGen/PowerPC/mmaplus-intrinsics.ll +++ b/llvm/test/CodeGen/PowerPC/mmaplus-intrinsics.ll @@ -31,10 +31,10 @@ define void @ass_acc(ptr %ptr, <16 x i8> %vc) { ; CHECK-NEXT: vmr v3, v2 ; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp34, vsp34, 0 ; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 -; CHECK-NEXT: stxv v4, 48(r3) -; CHECK-NEXT: stxv v5, 32(r3) -; CHECK-NEXT: stxv v2, 16(r3) -; CHECK-NEXT: stxv v3, 0(r3) +; CHECK-NEXT: stxv v2, 48(r3) +; CHECK-NEXT: stxv v3, 32(r3) +; CHECK-NEXT: stxv v4, 16(r3) +; CHECK-NEXT: stxv v5, 0(r3) ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: ass_acc: @@ -55,7 +55,7 @@ define void @ass_acc(ptr %ptr, <16 x i8> %vc) { ; CHECK-O0-NEXT: vmr v3, v4 ; CHECK-O0-NEXT: vmr v2, v4 ; CHECK-O0-NEXT: dmxxinstfdmr512 wacc0, vsp34, vsp34, 0 -; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp36, vsp34, 0 ; CHECK-O0-NEXT: xxlor vs0, v4, v4 ; CHECK-O0-NEXT: stxv vs0, 48(r3) ; CHECK-O0-NEXT: xxlor vs0, v5, v5 @@ -121,10 +121,10 @@ define void @ld_st_xxmtacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-NEXT: lxv v4, 48(r3) ; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0 ; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 -; CHECK-NEXT: stxv v4, 48(r7) -; CHECK-NEXT: stxv v5, 32(r7) -; CHECK-NEXT: stxv v2, 16(r7) -; CHECK-NEXT: stxv v3, 0(r7) +; CHECK-NEXT: stxv v2, 48(r7) +; CHECK-NEXT: stxv v3, 32(r7) +; CHECK-NEXT: stxv v4, 16(r7) +; CHECK-NEXT: stxv v5, 0(r7) ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: ld_st_xxmtacc: @@ -154,7 +154,7 @@ define void @ld_st_xxmtacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-O0-NEXT: lxv vs0, 48(r3) ; CHECK-O0-NEXT: xxlor v2, vs0, vs0 ; CHECK-O0-NEXT: dmxxinstfdmr512 wacc0, vsp34, vsp36, 0 -; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp36, vsp34, 0 ; CHECK-O0-NEXT: xxlor vs0, v4, v4 ; CHECK-O0-NEXT: stxv vs0, 48(r7) ; CHECK-O0-NEXT: xxlor vs0, v5, v5 @@ -236,10 +236,10 @@ define void @ld_op_st_xxmtacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp32, vsp36, 0 ; CHECK-NEXT: xvi4ger8pp wacc0, v2, v2 ; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 -; CHECK-NEXT: stxv v4, 48(r7) -; CHECK-NEXT: stxv v5, 32(r7) -; CHECK-NEXT: stxv v2, 16(r7) -; CHECK-NEXT: stxv v3, 0(r7) +; CHECK-NEXT: stxv v2, 48(r7) +; CHECK-NEXT: stxv v3, 32(r7) +; CHECK-NEXT: stxv v4, 16(r7) +; CHECK-NEXT: stxv v5, 0(r7) ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: ld_op_st_xxmtacc: @@ -271,7 +271,7 @@ define void @ld_op_st_xxmtacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-O0-NEXT: xxlor v4, vs0, vs0 ; CHECK-O0-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp32, 0 ; CHECK-O0-NEXT: xvi4ger8pp wacc0, v2, v2 -; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp36, vsp34, 0 ; CHECK-O0-NEXT: xxlor vs0, v4, v4 ; CHECK-O0-NEXT: stxv vs0, 48(r7) ; CHECK-O0-NEXT: xxlor vs0, v5, v5 @@ -356,14 +356,14 @@ define void @ld_st_xxmfacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-NEXT: lxv v4, 48(r3) ; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0 ; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 -; CHECK-NEXT: stxv v4, 48(r3) -; CHECK-NEXT: stxv v5, 32(r3) -; CHECK-NEXT: stxv v2, 16(r3) -; CHECK-NEXT: stxv v3, 0(r3) -; CHECK-NEXT: stxv v4, 48(r7) -; CHECK-NEXT: stxv v5, 32(r7) -; CHECK-NEXT: stxv v2, 16(r7) -; CHECK-NEXT: stxv v3, 0(r7) +; CHECK-NEXT: stxv v2, 48(r3) +; CHECK-NEXT: stxv v3, 32(r3) +; CHECK-NEXT: stxv v4, 16(r3) +; CHECK-NEXT: stxv v5, 0(r3) +; CHECK-NEXT: stxv v2, 48(r7) +; CHECK-NEXT: stxv v3, 32(r7) +; CHECK-NEXT: stxv v4, 16(r7) +; CHECK-NEXT: stxv v5, 0(r7) ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: ld_st_xxmfacc: @@ -397,7 +397,7 @@ define void @ld_st_xxmfacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-O0-NEXT: lxv vs0, 48(r3) ; CHECK-O0-NEXT: xxlor v2, vs0, vs0 ; CHECK-O0-NEXT: dmxxinstfdmr512 wacc0, vsp34, vsp36, 0 -; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp36, vsp34, 0 ; CHECK-O0-NEXT: xxlor vs3, v4, v4 ; CHECK-O0-NEXT: stxv vs3, 48(r3) ; CHECK-O0-NEXT: xxlor vs2, v5, v5 @@ -496,10 +496,10 @@ define void @ld_op_st_xxmfacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp32, vsp36, 0 ; CHECK-NEXT: xvi4ger8pp wacc0, v2, v2 ; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 -; CHECK-NEXT: stxv v4, 48(r7) -; CHECK-NEXT: stxv v5, 32(r7) -; CHECK-NEXT: stxv v2, 16(r7) -; CHECK-NEXT: stxv v3, 0(r7) +; CHECK-NEXT: stxv v2, 48(r7) +; CHECK-NEXT: stxv v3, 32(r7) +; CHECK-NEXT: stxv v4, 16(r7) +; CHECK-NEXT: stxv v5, 0(r7) ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: ld_op_st_xxmfacc: @@ -531,7 +531,7 @@ define void @ld_op_st_xxmfacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-O0-NEXT: xxlor v4, vs0, vs0 ; CHECK-O0-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp32, 0 ; CHECK-O0-NEXT: xvi4ger8pp wacc0, v2, v2 -; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp36, vsp34, 0 ; CHECK-O0-NEXT: xxlor vs0, v4, v4 ; CHECK-O0-NEXT: stxv vs0, 48(r7) ; CHECK-O0-NEXT: xxlor vs0, v5, v5 @@ -621,10 +621,10 @@ define void @cmplx_xxmacc(ptr %ptr1, ptr %ptr2, <16 x i8> %vc1, <16 x i8> %vc2) ; CHECK-NEXT: xvf64gerpp wacc0, vsp34, v5 ; CHECK-NEXT: xvf64gerpp wacc0, vsp36, v4 ; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 -; CHECK-NEXT: stxv v4, 48(r3) -; CHECK-NEXT: stxv v5, 32(r3) -; CHECK-NEXT: stxv v2, 16(r3) -; CHECK-NEXT: stxv v3, 0(r3) +; CHECK-NEXT: stxv v2, 48(r3) +; CHECK-NEXT: stxv v3, 32(r3) +; CHECK-NEXT: stxv v4, 16(r3) +; CHECK-NEXT: stxv v5, 0(r3) ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: cmplx_xxmacc: @@ -673,7 +673,7 @@ define void @cmplx_xxmacc(ptr %ptr1, ptr %ptr2, <16 x i8> %vc1, <16 x i8> %vc2) ; CHECK-O0-NEXT: xvf64gerpp wacc0, vsp32, vs0 ; CHECK-O0-NEXT: xxlor vs0, v4, v4 ; CHECK-O0-NEXT: xvf64gerpp wacc0, vsp34, vs0 -; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp36, vsp34, 0 ; CHECK-O0-NEXT: xxlor vs0, v4, v4 ; CHECK-O0-NEXT: stxv vs0, 48(r3) ; CHECK-O0-NEXT: xxlor vs0, v5, v5 @@ -783,10 +783,10 @@ define void @int_xxsetaccz(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xxsetaccz wacc0 ; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 -; CHECK-NEXT: stxv v4, 48(r3) -; CHECK-NEXT: stxv v5, 32(r3) -; CHECK-NEXT: stxv v2, 16(r3) -; CHECK-NEXT: stxv v3, 0(r3) +; CHECK-NEXT: stxv v2, 48(r3) +; CHECK-NEXT: stxv v3, 32(r3) +; CHECK-NEXT: stxv v4, 16(r3) +; CHECK-NEXT: stxv v5, 0(r3) ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: int_xxsetaccz: @@ -802,7 +802,7 @@ define void @int_xxsetaccz(ptr %ptr) { ; CHECK-O0-LABEL: int_xxsetaccz: ; CHECK-O0: # %bb.0: # %entry ; CHECK-O0-NEXT: xxsetaccz wacc0 -; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp36, vsp34, 0 ; CHECK-O0-NEXT: xxlor vs0, v4, v4 ; CHECK-O0-NEXT: stxv vs0, 48(r3) ; CHECK-O0-NEXT: xxlor vs0, v5, v5 @@ -946,14 +946,14 @@ define void @testcse(ptr %res, <16 x i8> %vc) { ; CHECK-NEXT: xxsetaccz wacc0 ; CHECK-NEXT: xvf32gerpp wacc0, v2, v2 ; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 -; CHECK-NEXT: stxv v4, 48(r3) -; CHECK-NEXT: stxv v5, 32(r3) -; CHECK-NEXT: stxv v2, 16(r3) -; CHECK-NEXT: stxv v3, 0(r3) -; CHECK-NEXT: stxv v4, 112(r3) -; CHECK-NEXT: stxv v5, 96(r3) -; CHECK-NEXT: stxv v2, 80(r3) -; CHECK-NEXT: stxv v3, 64(r3) +; CHECK-NEXT: stxv v2, 48(r3) +; CHECK-NEXT: stxv v3, 32(r3) +; CHECK-NEXT: stxv v4, 16(r3) +; CHECK-NEXT: stxv v5, 0(r3) +; CHECK-NEXT: stxv v2, 112(r3) +; CHECK-NEXT: stxv v3, 96(r3) +; CHECK-NEXT: stxv v4, 80(r3) +; CHECK-NEXT: stxv v5, 64(r3) ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: testcse: @@ -975,7 +975,7 @@ define void @testcse(ptr %res, <16 x i8> %vc) { ; CHECK-O0: # %bb.0: # %entry ; CHECK-O0-NEXT: xxsetaccz wacc0 ; CHECK-O0-NEXT: xvf32gerpp wacc0, v2, v2 -; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp36, vsp34, 0 ; CHECK-O0-NEXT: xxlor vs3, v4, v4 ; CHECK-O0-NEXT: stxv vs3, 48(r3) ; CHECK-O0-NEXT: xxlor vs2, v5, v5 @@ -1065,10 +1065,10 @@ define void @test_ldst_1(ptr nocapture readonly %vqp, ptr %vpp, <16 x i8> %vc, p ; CHECK-NEXT: plxvp vsp36, 8(r4), 0 ; CHECK-NEXT: pmxvf64gernn wacc0, vsp36, v2, 0, 0 ; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 -; CHECK-NEXT: stxv v4, 48(r7) -; CHECK-NEXT: stxv v5, 32(r7) -; CHECK-NEXT: stxv v2, 16(r7) -; CHECK-NEXT: stxv v3, 0(r7) +; CHECK-NEXT: stxv v2, 48(r7) +; CHECK-NEXT: stxv v3, 32(r7) +; CHECK-NEXT: stxv v4, 16(r7) +; CHECK-NEXT: stxv v5, 0(r7) ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: test_ldst_1: @@ -1104,7 +1104,7 @@ define void @test_ldst_1(ptr nocapture readonly %vqp, ptr %vpp, <16 x i8> %vc, p ; CHECK-O0-NEXT: plxvp vsp34, 8(r4), 0 ; CHECK-O0-NEXT: xxlor vs0, v4, v4 ; CHECK-O0-NEXT: pmxvf64gernn wacc0, vsp34, vs0, 0, 0 -; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp36, vsp34, 0 ; CHECK-O0-NEXT: xxlor vs0, v4, v4 ; CHECK-O0-NEXT: stxv vs0, 48(r7) ; CHECK-O0-NEXT: xxlor vs0, v5, v5 diff --git a/llvm/test/CodeGen/PowerPC/v1024ls.ll b/llvm/test/CodeGen/PowerPC/v1024ls.ll new file mode 100644 index 0000000000000..97668009cb0d7 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/v1024ls.ll @@ -0,0 +1,65 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -disable-auto-paired-vec-st=false \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -disable-auto-paired-vec-st=false \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE + +define void @v1024ls(ptr nocapture readonly %vqp, ptr nocapture %resp) { +; CHECK-LABEL: v1024ls: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxv v3, 0(r3) +; CHECK-NEXT: lxv v5, 32(r3) +; CHECK-NEXT: lxv v2, 16(r3) +; CHECK-NEXT: lxv v4, 48(r3) +; CHECK-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1 +; CHECK-NEXT: lxv v3, 64(r3) +; CHECK-NEXT: lxv v5, 96(r3) +; CHECK-NEXT: lxv v2, 80(r3) +; CHECK-NEXT: lxv v4, 112(r3) +; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-NEXT: stxv v2, 112(r4) +; CHECK-NEXT: stxv v3, 96(r4) +; CHECK-NEXT: stxv v4, 80(r4) +; CHECK-NEXT: stxv v5, 64(r4) +; CHECK-NEXT: dmxxextfdmr512 wacc_hi0, vsp34, vsp36, 1 +; CHECK-NEXT: stxv v2, 48(r4) +; CHECK-NEXT: stxv v3, 32(r4) +; CHECK-NEXT: stxv v4, 16(r4) +; CHECK-NEXT: stxv v5, 0(r4) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: v1024ls: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lxv v3, 112(r3) +; CHECK-BE-NEXT: lxv v5, 80(r3) +; CHECK-BE-NEXT: lxv v2, 96(r3) +; CHECK-BE-NEXT: lxv v4, 64(r3) +; CHECK-BE-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1 +; CHECK-BE-NEXT: lxv v3, 48(r3) +; CHECK-BE-NEXT: lxv v5, 16(r3) +; CHECK-BE-NEXT: lxv v2, 32(r3) +; CHECK-BE-NEXT: lxv v4, 0(r3) +; CHECK-BE-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-BE-NEXT: dmxxextfdmr512 wacc_hi0, vsp34, vsp36, 1 +; CHECK-BE-NEXT: stxv v5, 112(r4) +; CHECK-BE-NEXT: stxv v4, 96(r4) +; CHECK-BE-NEXT: stxv v3, 80(r4) +; CHECK-BE-NEXT: stxv v2, 64(r4) +; CHECK-BE-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-BE-NEXT: stxv v5, 48(r4) +; CHECK-BE-NEXT: stxv v4, 32(r4) +; CHECK-BE-NEXT: stxv v3, 16(r4) +; CHECK-BE-NEXT: stxv v2, 0(r4) +; CHECK-BE-NEXT: blr +entry: + %0 = load <1024 x i1>, ptr %vqp, align 64 + store <1024 x i1> %0, ptr %resp, align 64 + ret void +} + +declare <1024 x i1> @llvm.ppc.mma.dmsetdmrz() From f4ca19ec2d33f763ca2eb31f654d32c105c06983 Mon Sep 17 00:00:00 2001 From: Roland Froese Date: Thu, 20 Feb 2025 23:19:31 +0000 Subject: [PATCH 2/3] separate 1024 code --- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 208 ++++++++++++------ llvm/lib/Target/PowerPC/PPCISelLowering.h | 2 + .../test/CodeGen/PowerPC/mmaplus-acc-spill.ll | 8 +- .../CodeGen/PowerPC/mmaplus-intrinsics.ll | 106 ++++----- llvm/test/CodeGen/PowerPC/v1024ls.ll | 50 ++--- 5 files changed, 213 insertions(+), 161 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 300fa716297bd..a3d35cdb1f97a 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -11763,6 +11763,64 @@ SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, return Op; } +SDValue PPCTargetLowering::LowerDMFVectorLoad(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + LoadSDNode *LN = cast(Op.getNode()); + SDValue LoadChain = LN->getChain(); + SDValue BasePtr = LN->getBasePtr(); + EVT VT = Op.getValueType(); + + // Type v1024i1 is used for Dense Math dmr registers. + assert(VT == MVT::v1024i1 && "Unsupported type."); + assert((Subtarget.hasMMA() && Subtarget.isISAFuture()) && + "Dense Math support required."); + assert(Subtarget.pairedVectorMemops() && "Vector pair support required."); + + SmallVector Loads; + SmallVector LoadChains; + SDValue IntrinID = DAG.getConstant(Intrinsic::ppc_vsx_lxvp, dl, MVT::i32); + SDValue LoadOps[] = {LoadChain, IntrinID, BasePtr}; + MachineMemOperand *MMO = LN->getMemOperand(); + unsigned NumVecs = VT.getSizeInBits() / 256; + for (unsigned Idx = 0; Idx < NumVecs; ++Idx) { + MachineMemOperand *NewMMO = + DAG.getMachineFunction().getMachineMemOperand(MMO, Idx * 32, 32); + if (Idx > 0) { + BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, + DAG.getConstant(32, dl, BasePtr.getValueType())); + LoadOps[2] = BasePtr; + } + SDValue Ld = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl, + DAG.getVTList(MVT::v256i1, MVT::Other), + LoadOps, MVT::v256i1, NewMMO); + LoadChains.push_back(Ld.getValue(1)); + Loads.push_back(Ld); + } + + if (Subtarget.isLittleEndian()) { + std::reverse(Loads.begin(), Loads.end()); + std::reverse(LoadChains.begin(), LoadChains.end()); + } + + SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); + SDValue Lo(DAG.getMachineNode(PPC::DMXXINSTFDMR512, dl, MVT::v512i1, Loads[0], + Loads[1]), + 0); + SDValue LoSub = DAG.getTargetConstant(PPC::sub_wacc_lo, dl, MVT::i32); + SDValue Hi(DAG.getMachineNode(PPC::DMXXINSTFDMR512_HI, dl, MVT::v512i1, + Loads[2], Loads[3]), + 0); + SDValue HiSub = DAG.getTargetConstant(PPC::sub_wacc_hi, dl, MVT::i32); + SDValue RC = DAG.getTargetConstant(PPC::DMRRCRegClassID, dl, MVT::i32); + const SDValue Ops[] = {RC, Lo, LoSub, Hi, HiSub}; + SDValue Value = + SDValue(DAG.getMachineNode(PPC::REG_SEQUENCE, dl, MVT::v1024i1, Ops), 0); + + SDValue RetOps[] = {Value, TF}; + return DAG.getMergeValues(RetOps, dl); +} + SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); @@ -11771,12 +11829,11 @@ SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op, SDValue BasePtr = LN->getBasePtr(); EVT VT = Op.getValueType(); - if (VT != MVT::v256i1 && VT != MVT::v512i1 && VT != MVT::v1024i1) - return Op; + if (VT == MVT::v1024i1) + return LowerDMFVectorLoad(Op, DAG); - // Used for dense math registers. - assert((VT != MVT::v1024i1 || Subtarget.isISAFuture()) && - "Type unsupported for this processor"); + if (VT != MVT::v256i1 && VT != MVT::v512i1) + return Op; // Type v256i1 is used for pairs and v512i1 is used for accumulators. // Here we create 2 or 4 v16i8 loads to load the pair or accumulator value in @@ -11805,40 +11862,76 @@ SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op, std::reverse(LoadChains.begin(), LoadChains.end()); } SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); - SDValue Value; - if (VT == MVT::v1024i1) { - SmallVector Pairs; - SDValue Vsx0Idx = DAG.getTargetConstant(PPC::sub_vsx0, dl, MVT::i32); - SDValue Vsx1Idx = DAG.getTargetConstant(PPC::sub_vsx1, dl, MVT::i32); - SDValue VSRpRC = DAG.getTargetConstant(PPC::VSRpRCRegClassID, dl, MVT::i32); - NumVecs >>= 1; - for (unsigned Idx = 0; Idx < NumVecs; ++Idx) { - const SDValue Ops[] = {VSRpRC, Loads[Idx * 2], Vsx0Idx, - Loads[Idx * 2 + 1], Vsx1Idx}; - Pairs.push_back(SDValue( - DAG.getMachineNode(PPC::REG_SEQUENCE, dl, MVT::v256i1, Ops), 0)); - } - SDValue Lo(DAG.getMachineNode(PPC::DMXXINSTFDMR512, dl, MVT::v512i1, - Pairs[0], Pairs[1]), - 0); - SDValue LoSub = DAG.getTargetConstant(PPC::sub_wacc_lo, dl, MVT::i32); - SDValue Hi(DAG.getMachineNode(PPC::DMXXINSTFDMR512_HI, dl, MVT::v512i1, - Pairs[2], Pairs[3]), - 0); - SDValue HiSub = DAG.getTargetConstant(PPC::sub_wacc_hi, dl, MVT::i32); - SDValue RC = DAG.getTargetConstant(PPC::DMRRCRegClassID, dl, MVT::i32); - const SDValue Ops[] = {RC, Lo, LoSub, Hi, HiSub}; - Value = SDValue( - DAG.getMachineNode(PPC::REG_SEQUENCE, dl, MVT::v1024i1, Ops), 0); - } else { - Value = - DAG.getNode(VT == MVT::v512i1 ? PPCISD::ACC_BUILD : PPCISD::PAIR_BUILD, - dl, VT, Loads); - } + SDValue Value = + DAG.getNode(VT == MVT::v512i1 ? PPCISD::ACC_BUILD : PPCISD::PAIR_BUILD, + dl, VT, Loads); SDValue RetOps[] = {Value, TF}; return DAG.getMergeValues(RetOps, dl); } +SDValue PPCTargetLowering::LowerDMFVectorStore(SDValue Op, + SelectionDAG &DAG) const { + + SDLoc dl(Op); + StoreSDNode *SN = cast(Op.getNode()); + SDValue StoreChain = SN->getChain(); + SDValue BasePtr = SN->getBasePtr(); + SmallVector Values; + SmallVector Stores; + EVT VT = SN->getValue().getValueType(); + + // Type v1024i1 is used for Dense Math dmr registers. + assert(VT == MVT::v1024i1 && "Unsupported type."); + assert((Subtarget.hasMMA() && Subtarget.isISAFuture()) && + "Dense Math support required."); + assert(Subtarget.pairedVectorMemops() && "Vector pair support required."); + + SDValue Lo( + DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, MVT::v512i1, + Op.getOperand(1), + DAG.getTargetConstant(PPC::sub_wacc_lo, dl, MVT::i32)), + 0); + SDValue Hi( + DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, MVT::v512i1, + Op.getOperand(1), + DAG.getTargetConstant(PPC::sub_wacc_hi, dl, MVT::i32)), + 0); + EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1}; + MachineSDNode *ExtNode = + DAG.getMachineNode(PPC::DMXXEXTFDMR512, dl, ReturnTypes, Lo); + Values.push_back(SDValue(ExtNode, 0)); + Values.push_back(SDValue(ExtNode, 1)); + ExtNode = DAG.getMachineNode(PPC::DMXXEXTFDMR512_HI, dl, ReturnTypes, Hi); + Values.push_back(SDValue(ExtNode, 0)); + Values.push_back(SDValue(ExtNode, 1)); + + if (Subtarget.isLittleEndian()) + std::reverse(Values.begin(), Values.end()); + + SDVTList Tys = DAG.getVTList(MVT::Other); + SmallVector Ops{ + StoreChain, DAG.getConstant(Intrinsic::ppc_vsx_stxvp, dl, MVT::i32), + Values[0], BasePtr}; + MachineMemOperand *MMO = SN->getMemOperand(); + unsigned NumVecs = VT.getSizeInBits() / 256; + for (unsigned Idx = 0; Idx < NumVecs; ++Idx) { + MachineMemOperand *NewMMO = + DAG.getMachineFunction().getMachineMemOperand(MMO, Idx * 32, 32); + if (Idx > 0) { + BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, + DAG.getConstant(32, dl, BasePtr.getValueType())); + Ops[3] = BasePtr; + } + Ops[2] = Values[Idx]; + SDValue St = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, dl, Tys, Ops, + MVT::v256i1, NewMMO); + Stores.push_back(St); + } + + SDValue TF = DAG.getTokenFactor(dl, Stores); + return TF; +} + SDValue PPCTargetLowering::LowerVectorStore(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); @@ -11846,16 +11939,14 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op, SDValue StoreChain = SN->getChain(); SDValue BasePtr = SN->getBasePtr(); SDValue Value = SN->getValue(); + SDValue Value2 = SN->getValue(); EVT StoreVT = Value.getValueType(); - SmallVector ValueVec; - if (StoreVT != MVT::v256i1 && StoreVT != MVT::v512i1 && - StoreVT != MVT::v1024i1) - return Op; + if (StoreVT == MVT::v1024i1) + return LowerDMFVectorStore(Op, DAG); - // Used for dense math registers. - assert((StoreVT != MVT::v1024i1 || Subtarget.isISAFuture()) && - "Type unsupported for this processor"); + if (StoreVT != MVT::v256i1 && StoreVT != MVT::v512i1) + return Op; // Type v256i1 is used for pairs and v512i1 is used for accumulators. // Here we create 2 or 4 v16i8 stores to store the pair or accumulator @@ -11873,43 +11964,20 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op, MachineSDNode *ExtNode = DAG.getMachineNode( PPC::DMXXEXTFDMR512, dl, ReturnTypes, Op.getOperand(1)); - ValueVec.push_back(SDValue(ExtNode, 0)); - ValueVec.push_back(SDValue(ExtNode, 1)); + Value = SDValue(ExtNode, 0); + Value2 = SDValue(ExtNode, 1); } else Value = DAG.getNode(PPCISD::XXMFACC, dl, MVT::v512i1, Value); NumVecs = 4; - - } else if (StoreVT == MVT::v1024i1) { - SDValue Lo(DAG.getMachineNode( - TargetOpcode::EXTRACT_SUBREG, dl, MVT::v512i1, - Op.getOperand(1), - DAG.getTargetConstant(PPC::sub_wacc_lo, dl, MVT::i32)), - 0); - SDValue Hi(DAG.getMachineNode( - TargetOpcode::EXTRACT_SUBREG, dl, MVT::v512i1, - Op.getOperand(1), - DAG.getTargetConstant(PPC::sub_wacc_hi, dl, MVT::i32)), - 0); - EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1}; - MachineSDNode *ExtNode = - DAG.getMachineNode(PPC::DMXXEXTFDMR512, dl, ReturnTypes, Lo); - ValueVec.push_back(SDValue(ExtNode, 0)); - ValueVec.push_back(SDValue(ExtNode, 1)); - ExtNode = DAG.getMachineNode(PPC::DMXXEXTFDMR512_HI, dl, ReturnTypes, Hi); - ValueVec.push_back(SDValue(ExtNode, 0)); - ValueVec.push_back(SDValue(ExtNode, 1)); - NumVecs = 8; } for (unsigned Idx = 0; Idx < NumVecs; ++Idx) { unsigned VecNum = Subtarget.isLittleEndian() ? NumVecs - 1 - Idx : Idx; SDValue Elt; if (Subtarget.isISAFuture()) { VecNum = Subtarget.isLittleEndian() ? 1 - (Idx % 2) : (Idx % 2); - unsigned Pairx = - Subtarget.isLittleEndian() ? (NumVecs - Idx - 1) / 2 : Idx / 2; - Elt = DAG.getNode( - PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, ValueVec[Pairx], - DAG.getConstant(VecNum, dl, getPointerTy(DAG.getDataLayout()))); + Elt = DAG.getNode(PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, + Idx > 1 ? Value2 : Value, + DAG.getConstant(VecNum, dl, getPointerTy(DAG.getDataLayout()))); } else Elt = DAG.getNode(PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, Value, DAG.getConstant(VecNum, dl, getPointerTy(DAG.getDataLayout()))); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 514329bbe92d7..1f22aa16a89be 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -1344,6 +1344,8 @@ namespace llvm { SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDMFVectorLoad(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDMFVectorStore(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, diff --git a/llvm/test/CodeGen/PowerPC/mmaplus-acc-spill.ll b/llvm/test/CodeGen/PowerPC/mmaplus-acc-spill.ll index c8ead89f96d66..5ca8c7b02cab4 100644 --- a/llvm/test/CodeGen/PowerPC/mmaplus-acc-spill.ll +++ b/llvm/test/CodeGen/PowerPC/mmaplus-acc-spill.ll @@ -46,10 +46,10 @@ define void @intrinsics1(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i ; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp34, vsp36, 0 ; CHECK-NEXT: xvf16ger2pp wacc0, v28, v30 ; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 -; CHECK-NEXT: stxv v2, 48(r30) -; CHECK-NEXT: stxv v3, 32(r30) -; CHECK-NEXT: stxv v4, 16(r30) -; CHECK-NEXT: stxv v5, 0(r30) +; CHECK-NEXT: stxv v4, 48(r30) +; CHECK-NEXT: stxv v5, 32(r30) +; CHECK-NEXT: stxv v2, 16(r30) +; CHECK-NEXT: stxv v3, 0(r30) ; CHECK-NEXT: lxv v31, 144(r1) # 16-byte Folded Reload ; CHECK-NEXT: lxv v30, 128(r1) # 16-byte Folded Reload ; CHECK-NEXT: lxv v29, 112(r1) # 16-byte Folded Reload diff --git a/llvm/test/CodeGen/PowerPC/mmaplus-intrinsics.ll b/llvm/test/CodeGen/PowerPC/mmaplus-intrinsics.ll index b3e4392b8d0e3..158ec7a3427c8 100644 --- a/llvm/test/CodeGen/PowerPC/mmaplus-intrinsics.ll +++ b/llvm/test/CodeGen/PowerPC/mmaplus-intrinsics.ll @@ -31,10 +31,10 @@ define void @ass_acc(ptr %ptr, <16 x i8> %vc) { ; CHECK-NEXT: vmr v3, v2 ; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp34, vsp34, 0 ; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 -; CHECK-NEXT: stxv v2, 48(r3) -; CHECK-NEXT: stxv v3, 32(r3) -; CHECK-NEXT: stxv v4, 16(r3) -; CHECK-NEXT: stxv v5, 0(r3) +; CHECK-NEXT: stxv v4, 48(r3) +; CHECK-NEXT: stxv v5, 32(r3) +; CHECK-NEXT: stxv v2, 16(r3) +; CHECK-NEXT: stxv v3, 0(r3) ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: ass_acc: @@ -55,7 +55,7 @@ define void @ass_acc(ptr %ptr, <16 x i8> %vc) { ; CHECK-O0-NEXT: vmr v3, v4 ; CHECK-O0-NEXT: vmr v2, v4 ; CHECK-O0-NEXT: dmxxinstfdmr512 wacc0, vsp34, vsp34, 0 -; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 ; CHECK-O0-NEXT: xxlor vs0, v4, v4 ; CHECK-O0-NEXT: stxv vs0, 48(r3) ; CHECK-O0-NEXT: xxlor vs0, v5, v5 @@ -121,10 +121,10 @@ define void @ld_st_xxmtacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-NEXT: lxv v4, 48(r3) ; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0 ; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 -; CHECK-NEXT: stxv v2, 48(r7) -; CHECK-NEXT: stxv v3, 32(r7) -; CHECK-NEXT: stxv v4, 16(r7) -; CHECK-NEXT: stxv v5, 0(r7) +; CHECK-NEXT: stxv v4, 48(r7) +; CHECK-NEXT: stxv v5, 32(r7) +; CHECK-NEXT: stxv v2, 16(r7) +; CHECK-NEXT: stxv v3, 0(r7) ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: ld_st_xxmtacc: @@ -154,7 +154,7 @@ define void @ld_st_xxmtacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-O0-NEXT: lxv vs0, 48(r3) ; CHECK-O0-NEXT: xxlor v2, vs0, vs0 ; CHECK-O0-NEXT: dmxxinstfdmr512 wacc0, vsp34, vsp36, 0 -; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 ; CHECK-O0-NEXT: xxlor vs0, v4, v4 ; CHECK-O0-NEXT: stxv vs0, 48(r7) ; CHECK-O0-NEXT: xxlor vs0, v5, v5 @@ -236,10 +236,10 @@ define void @ld_op_st_xxmtacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp32, vsp36, 0 ; CHECK-NEXT: xvi4ger8pp wacc0, v2, v2 ; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 -; CHECK-NEXT: stxv v2, 48(r7) -; CHECK-NEXT: stxv v3, 32(r7) -; CHECK-NEXT: stxv v4, 16(r7) -; CHECK-NEXT: stxv v5, 0(r7) +; CHECK-NEXT: stxv v4, 48(r7) +; CHECK-NEXT: stxv v5, 32(r7) +; CHECK-NEXT: stxv v2, 16(r7) +; CHECK-NEXT: stxv v3, 0(r7) ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: ld_op_st_xxmtacc: @@ -271,7 +271,7 @@ define void @ld_op_st_xxmtacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-O0-NEXT: xxlor v4, vs0, vs0 ; CHECK-O0-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp32, 0 ; CHECK-O0-NEXT: xvi4ger8pp wacc0, v2, v2 -; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 ; CHECK-O0-NEXT: xxlor vs0, v4, v4 ; CHECK-O0-NEXT: stxv vs0, 48(r7) ; CHECK-O0-NEXT: xxlor vs0, v5, v5 @@ -356,14 +356,14 @@ define void @ld_st_xxmfacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-NEXT: lxv v4, 48(r3) ; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0 ; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 -; CHECK-NEXT: stxv v2, 48(r3) -; CHECK-NEXT: stxv v3, 32(r3) -; CHECK-NEXT: stxv v4, 16(r3) -; CHECK-NEXT: stxv v5, 0(r3) -; CHECK-NEXT: stxv v2, 48(r7) -; CHECK-NEXT: stxv v3, 32(r7) -; CHECK-NEXT: stxv v4, 16(r7) -; CHECK-NEXT: stxv v5, 0(r7) +; CHECK-NEXT: stxv v4, 48(r3) +; CHECK-NEXT: stxv v5, 32(r3) +; CHECK-NEXT: stxv v2, 16(r3) +; CHECK-NEXT: stxv v3, 0(r3) +; CHECK-NEXT: stxv v4, 48(r7) +; CHECK-NEXT: stxv v5, 32(r7) +; CHECK-NEXT: stxv v2, 16(r7) +; CHECK-NEXT: stxv v3, 0(r7) ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: ld_st_xxmfacc: @@ -397,7 +397,7 @@ define void @ld_st_xxmfacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-O0-NEXT: lxv vs0, 48(r3) ; CHECK-O0-NEXT: xxlor v2, vs0, vs0 ; CHECK-O0-NEXT: dmxxinstfdmr512 wacc0, vsp34, vsp36, 0 -; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 ; CHECK-O0-NEXT: xxlor vs3, v4, v4 ; CHECK-O0-NEXT: stxv vs3, 48(r3) ; CHECK-O0-NEXT: xxlor vs2, v5, v5 @@ -496,10 +496,10 @@ define void @ld_op_st_xxmfacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp32, vsp36, 0 ; CHECK-NEXT: xvi4ger8pp wacc0, v2, v2 ; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 -; CHECK-NEXT: stxv v2, 48(r7) -; CHECK-NEXT: stxv v3, 32(r7) -; CHECK-NEXT: stxv v4, 16(r7) -; CHECK-NEXT: stxv v5, 0(r7) +; CHECK-NEXT: stxv v4, 48(r7) +; CHECK-NEXT: stxv v5, 32(r7) +; CHECK-NEXT: stxv v2, 16(r7) +; CHECK-NEXT: stxv v3, 0(r7) ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: ld_op_st_xxmfacc: @@ -531,7 +531,7 @@ define void @ld_op_st_xxmfacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-O0-NEXT: xxlor v4, vs0, vs0 ; CHECK-O0-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp32, 0 ; CHECK-O0-NEXT: xvi4ger8pp wacc0, v2, v2 -; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 ; CHECK-O0-NEXT: xxlor vs0, v4, v4 ; CHECK-O0-NEXT: stxv vs0, 48(r7) ; CHECK-O0-NEXT: xxlor vs0, v5, v5 @@ -621,10 +621,10 @@ define void @cmplx_xxmacc(ptr %ptr1, ptr %ptr2, <16 x i8> %vc1, <16 x i8> %vc2) ; CHECK-NEXT: xvf64gerpp wacc0, vsp34, v5 ; CHECK-NEXT: xvf64gerpp wacc0, vsp36, v4 ; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 -; CHECK-NEXT: stxv v2, 48(r3) -; CHECK-NEXT: stxv v3, 32(r3) -; CHECK-NEXT: stxv v4, 16(r3) -; CHECK-NEXT: stxv v5, 0(r3) +; CHECK-NEXT: stxv v4, 48(r3) +; CHECK-NEXT: stxv v5, 32(r3) +; CHECK-NEXT: stxv v2, 16(r3) +; CHECK-NEXT: stxv v3, 0(r3) ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: cmplx_xxmacc: @@ -673,7 +673,7 @@ define void @cmplx_xxmacc(ptr %ptr1, ptr %ptr2, <16 x i8> %vc1, <16 x i8> %vc2) ; CHECK-O0-NEXT: xvf64gerpp wacc0, vsp32, vs0 ; CHECK-O0-NEXT: xxlor vs0, v4, v4 ; CHECK-O0-NEXT: xvf64gerpp wacc0, vsp34, vs0 -; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 ; CHECK-O0-NEXT: xxlor vs0, v4, v4 ; CHECK-O0-NEXT: stxv vs0, 48(r3) ; CHECK-O0-NEXT: xxlor vs0, v5, v5 @@ -783,10 +783,10 @@ define void @int_xxsetaccz(ptr %ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xxsetaccz wacc0 ; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 -; CHECK-NEXT: stxv v2, 48(r3) -; CHECK-NEXT: stxv v3, 32(r3) -; CHECK-NEXT: stxv v4, 16(r3) -; CHECK-NEXT: stxv v5, 0(r3) +; CHECK-NEXT: stxv v4, 48(r3) +; CHECK-NEXT: stxv v5, 32(r3) +; CHECK-NEXT: stxv v2, 16(r3) +; CHECK-NEXT: stxv v3, 0(r3) ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: int_xxsetaccz: @@ -802,7 +802,7 @@ define void @int_xxsetaccz(ptr %ptr) { ; CHECK-O0-LABEL: int_xxsetaccz: ; CHECK-O0: # %bb.0: # %entry ; CHECK-O0-NEXT: xxsetaccz wacc0 -; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 ; CHECK-O0-NEXT: xxlor vs0, v4, v4 ; CHECK-O0-NEXT: stxv vs0, 48(r3) ; CHECK-O0-NEXT: xxlor vs0, v5, v5 @@ -946,14 +946,14 @@ define void @testcse(ptr %res, <16 x i8> %vc) { ; CHECK-NEXT: xxsetaccz wacc0 ; CHECK-NEXT: xvf32gerpp wacc0, v2, v2 ; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 -; CHECK-NEXT: stxv v2, 48(r3) -; CHECK-NEXT: stxv v3, 32(r3) -; CHECK-NEXT: stxv v4, 16(r3) -; CHECK-NEXT: stxv v5, 0(r3) -; CHECK-NEXT: stxv v2, 112(r3) -; CHECK-NEXT: stxv v3, 96(r3) -; CHECK-NEXT: stxv v4, 80(r3) -; CHECK-NEXT: stxv v5, 64(r3) +; CHECK-NEXT: stxv v4, 48(r3) +; CHECK-NEXT: stxv v5, 32(r3) +; CHECK-NEXT: stxv v2, 16(r3) +; CHECK-NEXT: stxv v3, 0(r3) +; CHECK-NEXT: stxv v4, 112(r3) +; CHECK-NEXT: stxv v5, 96(r3) +; CHECK-NEXT: stxv v2, 80(r3) +; CHECK-NEXT: stxv v3, 64(r3) ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: testcse: @@ -975,7 +975,7 @@ define void @testcse(ptr %res, <16 x i8> %vc) { ; CHECK-O0: # %bb.0: # %entry ; CHECK-O0-NEXT: xxsetaccz wacc0 ; CHECK-O0-NEXT: xvf32gerpp wacc0, v2, v2 -; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 ; CHECK-O0-NEXT: xxlor vs3, v4, v4 ; CHECK-O0-NEXT: stxv vs3, 48(r3) ; CHECK-O0-NEXT: xxlor vs2, v5, v5 @@ -1065,10 +1065,10 @@ define void @test_ldst_1(ptr nocapture readonly %vqp, ptr %vpp, <16 x i8> %vc, p ; CHECK-NEXT: plxvp vsp36, 8(r4), 0 ; CHECK-NEXT: pmxvf64gernn wacc0, vsp36, v2, 0, 0 ; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 -; CHECK-NEXT: stxv v2, 48(r7) -; CHECK-NEXT: stxv v3, 32(r7) -; CHECK-NEXT: stxv v4, 16(r7) -; CHECK-NEXT: stxv v5, 0(r7) +; CHECK-NEXT: stxv v4, 48(r7) +; CHECK-NEXT: stxv v5, 32(r7) +; CHECK-NEXT: stxv v2, 16(r7) +; CHECK-NEXT: stxv v3, 0(r7) ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: test_ldst_1: @@ -1104,7 +1104,7 @@ define void @test_ldst_1(ptr nocapture readonly %vqp, ptr %vpp, <16 x i8> %vc, p ; CHECK-O0-NEXT: plxvp vsp34, 8(r4), 0 ; CHECK-O0-NEXT: xxlor vs0, v4, v4 ; CHECK-O0-NEXT: pmxvf64gernn wacc0, vsp34, vs0, 0, 0 -; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-O0-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 ; CHECK-O0-NEXT: xxlor vs0, v4, v4 ; CHECK-O0-NEXT: stxv vs0, 48(r7) ; CHECK-O0-NEXT: xxlor vs0, v5, v5 diff --git a/llvm/test/CodeGen/PowerPC/v1024ls.ll b/llvm/test/CodeGen/PowerPC/v1024ls.ll index 97668009cb0d7..c7f6911f9ddbc 100644 --- a/llvm/test/CodeGen/PowerPC/v1024ls.ll +++ b/llvm/test/CodeGen/PowerPC/v1024ls.ll @@ -1,60 +1,42 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ -; RUN: -disable-auto-paired-vec-st=false \ ; RUN: -mcpu=future -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ -; RUN: -disable-auto-paired-vec-st=false \ ; RUN: -mcpu=future -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE define void @v1024ls(ptr nocapture readonly %vqp, ptr nocapture %resp) { ; CHECK-LABEL: v1024ls: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lxv v3, 0(r3) -; CHECK-NEXT: lxv v5, 32(r3) -; CHECK-NEXT: lxv v2, 16(r3) -; CHECK-NEXT: lxv v4, 48(r3) +; CHECK-NEXT: lxvp vsp34, 0(r3) +; CHECK-NEXT: lxvp vsp36, 32(r3) ; CHECK-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1 -; CHECK-NEXT: lxv v3, 64(r3) -; CHECK-NEXT: lxv v5, 96(r3) -; CHECK-NEXT: lxv v2, 80(r3) -; CHECK-NEXT: lxv v4, 112(r3) +; CHECK-NEXT: lxvp vsp34, 64(r3) +; CHECK-NEXT: lxvp vsp36, 96(r3) ; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0 ; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 -; CHECK-NEXT: stxv v2, 112(r4) -; CHECK-NEXT: stxv v3, 96(r4) -; CHECK-NEXT: stxv v4, 80(r4) -; CHECK-NEXT: stxv v5, 64(r4) +; CHECK-NEXT: stxvp vsp34, 96(r4) +; CHECK-NEXT: stxvp vsp36, 64(r4) ; CHECK-NEXT: dmxxextfdmr512 wacc_hi0, vsp34, vsp36, 1 -; CHECK-NEXT: stxv v2, 48(r4) -; CHECK-NEXT: stxv v3, 32(r4) -; CHECK-NEXT: stxv v4, 16(r4) -; CHECK-NEXT: stxv v5, 0(r4) +; CHECK-NEXT: stxvp vsp34, 32(r4) +; CHECK-NEXT: stxvp vsp36, 0(r4) ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: v1024ls: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: lxv v3, 112(r3) -; CHECK-BE-NEXT: lxv v5, 80(r3) -; CHECK-BE-NEXT: lxv v2, 96(r3) -; CHECK-BE-NEXT: lxv v4, 64(r3) +; CHECK-BE-NEXT: lxvp vsp34, 96(r3) +; CHECK-BE-NEXT: lxvp vsp36, 64(r3) ; CHECK-BE-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1 -; CHECK-BE-NEXT: lxv v3, 48(r3) -; CHECK-BE-NEXT: lxv v5, 16(r3) -; CHECK-BE-NEXT: lxv v2, 32(r3) -; CHECK-BE-NEXT: lxv v4, 0(r3) +; CHECK-BE-NEXT: lxvp vsp34, 32(r3) +; CHECK-BE-NEXT: lxvp vsp36, 0(r3) ; CHECK-BE-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0 ; CHECK-BE-NEXT: dmxxextfdmr512 wacc_hi0, vsp34, vsp36, 1 -; CHECK-BE-NEXT: stxv v5, 112(r4) -; CHECK-BE-NEXT: stxv v4, 96(r4) -; CHECK-BE-NEXT: stxv v3, 80(r4) -; CHECK-BE-NEXT: stxv v2, 64(r4) +; CHECK-BE-NEXT: stxvp vsp36, 96(r4) +; CHECK-BE-NEXT: stxvp vsp34, 64(r4) ; CHECK-BE-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0 -; CHECK-BE-NEXT: stxv v5, 48(r4) -; CHECK-BE-NEXT: stxv v4, 32(r4) -; CHECK-BE-NEXT: stxv v3, 16(r4) -; CHECK-BE-NEXT: stxv v2, 0(r4) +; CHECK-BE-NEXT: stxvp vsp36, 32(r4) +; CHECK-BE-NEXT: stxvp vsp34, 0(r4) ; CHECK-BE-NEXT: blr entry: %0 = load <1024 x i1>, ptr %vqp, align 64 From f927abaf83feb003af1deb1dae55c61279991f40 Mon Sep 17 00:00:00 2001 From: Roland Froese Date: Wed, 26 Feb 2025 18:56:59 +0000 Subject: [PATCH 3/3] update check --- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index a3d35cdb1f97a..f204988afe20c 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -1355,19 +1355,18 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::STORE, MVT::v256i1, Custom); } if (Subtarget.hasMMA()) { - if (Subtarget.isISAFuture()) + if (Subtarget.isISAFuture()) { addRegisterClass(MVT::v512i1, &PPC::WACCRCRegClass); - else + addRegisterClass(MVT::v1024i1, &PPC::DMRRCRegClass); + setOperationAction(ISD::LOAD, MVT::v1024i1, Custom); + setOperationAction(ISD::STORE, MVT::v1024i1, Custom); + } else { addRegisterClass(MVT::v512i1, &PPC::UACCRCRegClass); + } setOperationAction(ISD::LOAD, MVT::v512i1, Custom); setOperationAction(ISD::STORE, MVT::v512i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v512i1, Custom); } - if (Subtarget.isISAFuture()) { - setOperationAction(ISD::LOAD, MVT::v1024i1, Custom); - setOperationAction(ISD::STORE, MVT::v1024i1, Custom); - addRegisterClass(MVT::v1024i1, &PPC::DMRRCRegClass); - } if (Subtarget.has64BitSupport()) setOperationAction(ISD::PREFETCH, MVT::Other, Legal);