llvm
diff --git a/‎llvm/lib/Target/PowerPC/PPCISelLowering.cpp‎
Lines changed: 138 additions & 70 deletions b/‎llvm/lib/Target/PowerPC/PPCISelLowering.cpp‎
Lines changed: 138 additions & 70 deletions
diff --git a/‎llvm/lib/Target/PowerPC/PPCISelLowering.h‎
Lines changed: 2 additions & 0 deletions b/‎llvm/lib/Target/PowerPC/PPCISelLowering.h‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎llvm/test/CodeGen/PowerPC/mmaplus-acc-spill.ll‎
Lines changed: 4 additions & 4 deletions b/‎llvm/test/CodeGen/PowerPC/mmaplus-acc-spill.ll‎
Lines changed: 4 additions & 4 deletions
@@ -11763,6 +11763,64 @@ SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
   return Op;
 }
 
+SDValue PPCTargetLowering::LowerDMFVectorLoad(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  LoadSDNode *LN = cast<LoadSDNode>(Op.getNode());
+  SDValue LoadChain = LN->getChain();
+  SDValue BasePtr = LN->getBasePtr();
+  EVT VT = Op.getValueType();
+
+  // Type v1024i1 is used for Dense Math dmr registers.
+  assert(VT == MVT::v1024i1 && "Unsupported type.");
+  assert((Subtarget.hasMMA() && Subtarget.isISAFuture()) &&
+         "Dense Math support required.");
+  assert(Subtarget.pairedVectorMemops() && "Vector pair support required.");
+
+  SmallVector<SDValue, 4> Loads;
+  SmallVector<SDValue, 4> LoadChains;
+  SDValue IntrinID = DAG.getConstant(Intrinsic::ppc_vsx_lxvp, dl, MVT::i32);
+  SDValue LoadOps[] = {LoadChain, IntrinID, BasePtr};
+  MachineMemOperand *MMO = LN->getMemOperand();
+  unsigned NumVecs = VT.getSizeInBits() / 256;
+  for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
+    MachineMemOperand *NewMMO =
+        DAG.getMachineFunction().getMachineMemOperand(MMO, Idx * 32, 32);
+    if (Idx > 0) {
+      BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+                            DAG.getConstant(32, dl, BasePtr.getValueType()));
+      LoadOps[2] = BasePtr;
+    }
+    SDValue Ld = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl,
+                                         DAG.getVTList(MVT::v256i1, MVT::Other),
+                                         LoadOps, MVT::v256i1, NewMMO);
+    LoadChains.push_back(Ld.getValue(1));
+    Loads.push_back(Ld);
+  }
+
+  if (Subtarget.isLittleEndian()) {
+    std::reverse(Loads.begin(), Loads.end());
+    std::reverse(LoadChains.begin(), LoadChains.end());
+  }
+
+  SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
+  SDValue Lo(DAG.getMachineNode(PPC::DMXXINSTFDMR512, dl, MVT::v512i1, Loads[0],
+                                Loads[1]),
+             0);
+  SDValue LoSub = DAG.getTargetConstant(PPC::sub_wacc_lo, dl, MVT::i32);
+  SDValue Hi(DAG.getMachineNode(PPC::DMXXINSTFDMR512_HI, dl, MVT::v512i1,
+                                Loads[2], Loads[3]),
+             0);
+  SDValue HiSub = DAG.getTargetConstant(PPC::sub_wacc_hi, dl, MVT::i32);
+  SDValue RC = DAG.getTargetConstant(PPC::DMRRCRegClassID, dl, MVT::i32);
+  const SDValue Ops[] = {RC, Lo, LoSub, Hi, HiSub};
+  SDValue Value =
+      SDValue(DAG.getMachineNode(PPC::REG_SEQUENCE, dl, MVT::v1024i1, Ops), 0);
+
+  SDValue RetOps[] = {Value, TF};
+  return DAG.getMergeValues(RetOps, dl);
+}
+
 SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
                                            SelectionDAG &DAG) const {
   SDLoc dl(Op);
@@ -11771,12 +11829,11 @@ SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
   SDValue BasePtr = LN->getBasePtr();
   EVT VT = Op.getValueType();
 
-  if (VT != MVT::v256i1 && VT != MVT::v512i1 && VT != MVT::v1024i1)
-    return Op;
+  if (VT == MVT::v1024i1)
+    return LowerDMFVectorLoad(Op, DAG);
 
-  // Used for dense math registers.
-  assert((VT != MVT::v1024i1 || Subtarget.isISAFuture()) &&
-         "Type unsupported for this processor");
+  if (VT != MVT::v256i1 && VT != MVT::v512i1)
+    return Op;
 
   // Type v256i1 is used for pairs and v512i1 is used for accumulators.
   // Here we create 2 or 4 v16i8 loads to load the pair or accumulator value in
@@ -11805,57 +11862,91 @@ SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
     std::reverse(LoadChains.begin(), LoadChains.end());
   }
   SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
-  SDValue Value;
-  if (VT == MVT::v1024i1) {
-    SmallVector<SDValue, 4> Pairs;
-    SDValue Vsx0Idx = DAG.getTargetConstant(PPC::sub_vsx0, dl, MVT::i32);
-    SDValue Vsx1Idx = DAG.getTargetConstant(PPC::sub_vsx1, dl, MVT::i32);
-    SDValue VSRpRC = DAG.getTargetConstant(PPC::VSRpRCRegClassID, dl, MVT::i32);
-    NumVecs >>= 1;
-    for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
-      const SDValue Ops[] = {VSRpRC, Loads[Idx * 2], Vsx0Idx,
-                             Loads[Idx * 2 + 1], Vsx1Idx};
-      Pairs.push_back(SDValue(
-          DAG.getMachineNode(PPC::REG_SEQUENCE, dl, MVT::v256i1, Ops), 0));
-    }
-    SDValue Lo(DAG.getMachineNode(PPC::DMXXINSTFDMR512, dl, MVT::v512i1,
-                                  Pairs[0], Pairs[1]),
-               0);
-    SDValue LoSub = DAG.getTargetConstant(PPC::sub_wacc_lo, dl, MVT::i32);
-    SDValue Hi(DAG.getMachineNode(PPC::DMXXINSTFDMR512_HI, dl, MVT::v512i1,
-                                  Pairs[2], Pairs[3]),
-               0);
-    SDValue HiSub = DAG.getTargetConstant(PPC::sub_wacc_hi, dl, MVT::i32);
-    SDValue RC = DAG.getTargetConstant(PPC::DMRRCRegClassID, dl, MVT::i32);
-    const SDValue Ops[] = {RC, Lo, LoSub, Hi, HiSub};
-    Value = SDValue(
-        DAG.getMachineNode(PPC::REG_SEQUENCE, dl, MVT::v1024i1, Ops), 0);
-  } else {
-    Value =
-        DAG.getNode(VT == MVT::v512i1 ? PPCISD::ACC_BUILD : PPCISD::PAIR_BUILD,
-                    dl, VT, Loads);
-  }
+  SDValue Value =
+      DAG.getNode(VT == MVT::v512i1 ? PPCISD::ACC_BUILD : PPCISD::PAIR_BUILD,
+                  dl, VT, Loads);
   SDValue RetOps[] = {Value, TF};
   return DAG.getMergeValues(RetOps, dl);
 }
 
+SDValue PPCTargetLowering::LowerDMFVectorStore(SDValue Op,
+                                               SelectionDAG &DAG) const {
+
+  SDLoc dl(Op);
+  StoreSDNode *SN = cast<StoreSDNode>(Op.getNode());
+  SDValue StoreChain = SN->getChain();
+  SDValue BasePtr = SN->getBasePtr();
+  SmallVector<SDValue, 4> Values;
+  SmallVector<SDValue, 4> Stores;
+  EVT VT = SN->getValue().getValueType();
+
+  // Type v1024i1 is used for Dense Math dmr registers.
+  assert(VT == MVT::v1024i1 && "Unsupported type.");
+  assert((Subtarget.hasMMA() && Subtarget.isISAFuture()) &&
+         "Dense Math support required.");
+  assert(Subtarget.pairedVectorMemops() && "Vector pair support required.");
+
+  SDValue Lo(
+      DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, MVT::v512i1,
+                         Op.getOperand(1),
+                         DAG.getTargetConstant(PPC::sub_wacc_lo, dl, MVT::i32)),
+      0);
+  SDValue Hi(
+      DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, MVT::v512i1,
+                         Op.getOperand(1),
+                         DAG.getTargetConstant(PPC::sub_wacc_hi, dl, MVT::i32)),
+      0);
+  EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1};
+  MachineSDNode *ExtNode =
+      DAG.getMachineNode(PPC::DMXXEXTFDMR512, dl, ReturnTypes, Lo);
+  Values.push_back(SDValue(ExtNode, 0));
+  Values.push_back(SDValue(ExtNode, 1));
+  ExtNode = DAG.getMachineNode(PPC::DMXXEXTFDMR512_HI, dl, ReturnTypes, Hi);
+  Values.push_back(SDValue(ExtNode, 0));
+  Values.push_back(SDValue(ExtNode, 1));
+
+  if (Subtarget.isLittleEndian())
+    std::reverse(Values.begin(), Values.end());
+
+  SDVTList Tys = DAG.getVTList(MVT::Other);
+  SmallVector<SDValue, 4> Ops{
+      StoreChain, DAG.getConstant(Intrinsic::ppc_vsx_stxvp, dl, MVT::i32),
+      Values[0], BasePtr};
+  MachineMemOperand *MMO = SN->getMemOperand();
+  unsigned NumVecs = VT.getSizeInBits() / 256;
+  for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
+    MachineMemOperand *NewMMO =
+        DAG.getMachineFunction().getMachineMemOperand(MMO, Idx * 32, 32);
+    if (Idx > 0) {
+      BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+                            DAG.getConstant(32, dl, BasePtr.getValueType()));
+      Ops[3] = BasePtr;
+    }
+    Ops[2] = Values[Idx];
+    SDValue St = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, dl, Tys, Ops,
+                                         MVT::v256i1, NewMMO);
+    Stores.push_back(St);
+  }
+
+  SDValue TF = DAG.getTokenFactor(dl, Stores);
+  return TF;
+}
+
 SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
                                             SelectionDAG &DAG) const {
   SDLoc dl(Op);
   StoreSDNode *SN = cast<StoreSDNode>(Op.getNode());
   SDValue StoreChain = SN->getChain();
   SDValue BasePtr = SN->getBasePtr();
   SDValue Value = SN->getValue();
+  SDValue Value2 = SN->getValue();
   EVT StoreVT = Value.getValueType();
-  SmallVector<SDValue, 4> ValueVec;
 
-  if (StoreVT != MVT::v256i1 && StoreVT != MVT::v512i1 &&
-      StoreVT != MVT::v1024i1)
-    return Op;
+  if (StoreVT == MVT::v1024i1)
+    return LowerDMFVectorStore(Op, DAG);
 
-  // Used for dense math registers.
-  assert((StoreVT != MVT::v1024i1 || Subtarget.isISAFuture()) &&
-         "Type unsupported for this processor");
+  if (StoreVT != MVT::v256i1 && StoreVT != MVT::v512i1)
+    return Op;
 
   // Type v256i1 is used for pairs and v512i1 is used for accumulators.
   // Here we create 2 or 4 v16i8 stores to store the pair or accumulator
@@ -11873,43 +11964,20 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
       MachineSDNode *ExtNode = DAG.getMachineNode(
           PPC::DMXXEXTFDMR512, dl, ReturnTypes, Op.getOperand(1));
 
-      ValueVec.push_back(SDValue(ExtNode, 0));
-      ValueVec.push_back(SDValue(ExtNode, 1));
+      Value = SDValue(ExtNode, 0);
+      Value2 = SDValue(ExtNode, 1);
     } else
       Value = DAG.getNode(PPCISD::XXMFACC, dl, MVT::v512i1, Value);
     NumVecs = 4;
-
-  } else if (StoreVT == MVT::v1024i1) {
-    SDValue Lo(DAG.getMachineNode(
-                   TargetOpcode::EXTRACT_SUBREG, dl, MVT::v512i1,
-                   Op.getOperand(1),
-                   DAG.getTargetConstant(PPC::sub_wacc_lo, dl, MVT::i32)),
-               0);
-    SDValue Hi(DAG.getMachineNode(
-                   TargetOpcode::EXTRACT_SUBREG, dl, MVT::v512i1,
-                   Op.getOperand(1),
-                   DAG.getTargetConstant(PPC::sub_wacc_hi, dl, MVT::i32)),
-               0);
-    EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1};
-    MachineSDNode *ExtNode =
-        DAG.getMachineNode(PPC::DMXXEXTFDMR512, dl, ReturnTypes, Lo);
-    ValueVec.push_back(SDValue(ExtNode, 0));
-    ValueVec.push_back(SDValue(ExtNode, 1));
-    ExtNode = DAG.getMachineNode(PPC::DMXXEXTFDMR512_HI, dl, ReturnTypes, Hi);
-    ValueVec.push_back(SDValue(ExtNode, 0));
-    ValueVec.push_back(SDValue(ExtNode, 1));
-    NumVecs = 8;
   }
   for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
     unsigned VecNum = Subtarget.isLittleEndian() ? NumVecs - 1 - Idx : Idx;
     SDValue Elt;
     if (Subtarget.isISAFuture()) {
       VecNum = Subtarget.isLittleEndian() ? 1 - (Idx % 2) : (Idx % 2);
-      unsigned Pairx =
-          Subtarget.isLittleEndian() ? (NumVecs - Idx - 1) / 2 : Idx / 2;
-      Elt = DAG.getNode(
-          PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, ValueVec[Pairx],
-          DAG.getConstant(VecNum, dl, getPointerTy(DAG.getDataLayout())));
+      Elt = DAG.getNode(PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8,
+                        Idx > 1 ? Value2 : Value,
+                        DAG.getConstant(VecNum, dl, getPointerTy(DAG.getDataLayout())));
     } else
       Elt = DAG.getNode(PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, Value,
                         DAG.getConstant(VecNum, dl, getPointerTy(DAG.getDataLayout())));
 
@@ -1344,6 +1344,8 @@ namespace llvm {
 
     SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerDMFVectorLoad(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerDMFVectorStore(SDValue Op, SelectionDAG &DAG) const;
 
     SDValue LowerCallResult(SDValue Chain, SDValue InGlue,
                             CallingConv::ID CallConv, bool isVarArg,
 
@@ -46,10 +46,10 @@ define void @intrinsics1(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i
 ; CHECK-NEXT:    dmxxinstfdmr512 wacc0, vsp34, vsp36, 0
 ; CHECK-NEXT:    xvf16ger2pp wacc0, v28, v30
 ; CHECK-NEXT:    dmxxextfdmr512 wacc0, vsp34, vsp36, 0
-; CHECK-NEXT:    stxv v2, 48(r30)
-; CHECK-NEXT:    stxv v3, 32(r30)
-; CHECK-NEXT:    stxv v4, 16(r30)
-; CHECK-NEXT:    stxv v5, 0(r30)
+; CHECK-NEXT:    stxv v4, 48(r30)
+; CHECK-NEXT:    stxv v5, 32(r30)
+; CHECK-NEXT:    stxv v2, 16(r30)
+; CHECK-NEXT:    stxv v3, 0(r30)
 ; CHECK-NEXT:    lxv v31, 144(r1) # 16-byte Folded Reload
 ; CHECK-NEXT:    lxv v30, 128(r1) # 16-byte Folded Reload
 ; CHECK-NEXT:    lxv v29, 112(r1) # 16-byte Folded Reload