
Commit dba8bec

Revert "AMDGPU: Move reg_sequence splat handling (llvm#140313)"
This reverts commit 4ddab12. Change-Id: I861cf070d77cab094dda697ba4b4ce017407f1e0
1 parent 4871a63 commit dba8bec

File tree (2 files changed: +62 -144 lines)

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
llvm/test/CodeGen/AMDGPU/packed-fp32.ll

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 48 additions & 144 deletions
@@ -119,22 +119,9 @@ class SIFoldOperandsImpl {
                       MachineOperand *OpToFold) const;
   bool isUseSafeToFold(const MachineInstr &MI,
                        const MachineOperand &UseMO) const;
-
-  const TargetRegisterClass *getRegSeqInit(
-      MachineInstr &RegSeq,
-      SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const;
-
-  const TargetRegisterClass *
+  bool
   getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
-                Register UseReg) const;
-
-  std::pair<MachineOperand *, const TargetRegisterClass *>
-  isRegSeqSplat(MachineInstr &RegSeg) const;
-
-  MachineOperand *tryFoldRegSeqSplat(MachineInstr *UseMI, unsigned UseOpIdx,
-                                     MachineOperand *SplatVal,
-                                     const TargetRegisterClass *SplatRC) const;
-
+                Register UseReg, uint8_t OpTy) const;
   bool tryToFoldACImm(MachineOperand &OpToFold, MachineInstr *UseMI,
                       unsigned UseOpIdx,
                       SmallVectorImpl<FoldCandidate> &FoldList) const;
@@ -838,24 +825,19 @@ static MachineOperand *lookUpCopyChain(const SIInstrInfo &TII,
   return Sub;
 }
 
-const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
-    MachineInstr &RegSeq,
-    SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const {
-
-  assert(RegSeq.isRegSequence());
-
-  const TargetRegisterClass *RC = nullptr;
-
-  for (unsigned I = 1, E = RegSeq.getNumExplicitOperands(); I != E; I += 2) {
-    MachineOperand &SrcOp = RegSeq.getOperand(I);
-    unsigned SubRegIdx = RegSeq.getOperand(I + 1).getImm();
+// Find a def of the UseReg, check if it is a reg_sequence and find initializers
+// for each subreg, tracking it to foldable inline immediate if possible.
+// Returns true on success.
+bool SIFoldOperandsImpl::getRegSeqInit(
+    SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
+    Register UseReg, uint8_t OpTy) const {
+  MachineInstr *Def = MRI->getVRegDef(UseReg);
+  if (!Def || !Def->isRegSequence())
+    return false;
 
-    // Only accept reg_sequence with uniform reg class inputs for simplicity.
-    const TargetRegisterClass *OpRC = getRegOpRC(*MRI, *TRI, SrcOp);
-    if (!RC)
-      RC = OpRC;
-    else if (!TRI->getCommonSubClass(RC, OpRC))
-      return nullptr;
+  for (unsigned I = 1, E = Def->getNumExplicitOperands(); I != E; I += 2) {
+    MachineOperand &SrcOp = Def->getOperand(I);
+    unsigned SubRegIdx = Def->getOperand(I + 1).getImm();
 
     if (SrcOp.getSubReg()) {
       // TODO: Handle subregister compose
@@ -864,106 +846,16 @@ const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
     }
 
     MachineOperand *DefSrc = lookUpCopyChain(*TII, *MRI, SrcOp.getReg());
-    if (DefSrc && (DefSrc->isReg() || DefSrc->isImm())) {
+    if (DefSrc && (DefSrc->isReg() ||
+                   (DefSrc->isImm() && TII->isInlineConstant(*DefSrc, OpTy)))) {
       Defs.emplace_back(DefSrc, SubRegIdx);
       continue;
     }
 
     Defs.emplace_back(&SrcOp, SubRegIdx);
   }
 
-  return RC;
-}
-
-// Find a def of the UseReg, check if it is a reg_sequence and find initializers
-// for each subreg, tracking it to an immediate if possible. Returns the
-// register class of the inputs on success.
-const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
-    SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
-    Register UseReg) const {
-  MachineInstr *Def = MRI->getVRegDef(UseReg);
-  if (!Def || !Def->isRegSequence())
-    return nullptr;
-
-  return getRegSeqInit(*Def, Defs);
-}
-
-std::pair<MachineOperand *, const TargetRegisterClass *>
-SIFoldOperandsImpl::isRegSeqSplat(MachineInstr &RegSeq) const {
-  SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
-  const TargetRegisterClass *SrcRC = getRegSeqInit(RegSeq, Defs);
-  if (!SrcRC)
-    return {};
-
-  // TODO: Recognize 64-bit splats broken into 32-bit pieces (i.e. recognize
-  // every other other element is 0 for 64-bit immediates)
-  int64_t Imm;
-  for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
-    const MachineOperand *Op = Defs[I].first;
-    if (!Op->isImm())
-      return {};
-
-    int64_t SubImm = Op->getImm();
-    if (!I) {
-      Imm = SubImm;
-      continue;
-    }
-    if (Imm != SubImm)
-      return {}; // Can only fold splat constants
-  }
-
-  return {Defs[0].first, SrcRC};
-}
-
-MachineOperand *SIFoldOperandsImpl::tryFoldRegSeqSplat(
-    MachineInstr *UseMI, unsigned UseOpIdx, MachineOperand *SplatVal,
-    const TargetRegisterClass *SplatRC) const {
-  const MCInstrDesc &Desc = UseMI->getDesc();
-  if (UseOpIdx >= Desc.getNumOperands())
-    return nullptr;
-
-  // Filter out unhandled pseudos.
-  if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
-    return nullptr;
-
-  int16_t RCID = Desc.operands()[UseOpIdx].RegClass;
-  if (RCID == -1)
-    return nullptr;
-
-  // Special case 0/-1, since when interpreted as a 64-bit element both halves
-  // have the same bits. Effectively this code does not handle 64-bit element
-  // operands correctly, as the incoming 64-bit constants are already split into
-  // 32-bit sequence elements.
-  //
-  // TODO: We should try to figure out how to interpret the reg_sequence as a
-  // split 64-bit splat constant, or use 64-bit pseudos for materializing f64
-  // constants.
-  if (SplatVal->getImm() != 0 && SplatVal->getImm() != -1) {
-    const TargetRegisterClass *OpRC = TRI->getRegClass(RCID);
-    // We need to figure out the scalar type read by the operand. e.g. the MFMA
-    // operand will be AReg_128, and we want to check if it's compatible with an
-    // AReg_32 constant.
-    uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
-    switch (OpTy) {
-    case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
-    case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
-      OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0);
-      break;
-    case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
-      OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0_sub1);
-      break;
-    default:
-      return nullptr;
-    }
-
-    if (!TRI->getCommonSubClass(OpRC, SplatRC))
-      return nullptr;
-  }
-
-  if (!TII->isOperandLegal(*UseMI, UseOpIdx, SplatVal))
-    return nullptr;
-
-  return SplatVal;
+  return true;
 }
 
 bool SIFoldOperandsImpl::tryToFoldACImm(
@@ -977,6 +869,7 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
   if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
     return false;
 
+  uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
   MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
   if (OpToFold.isImm()) {
     if (unsigned UseSubReg = UseOp.getSubReg()) {
@@ -1023,7 +916,31 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
     }
   }
 
-  return false;
+  SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
+  if (!getRegSeqInit(Defs, UseReg, OpTy))
+    return false;
+
+  int32_t Imm;
+  for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
+    const MachineOperand *Op = Defs[I].first;
+    if (!Op->isImm())
+      return false;
+
+    auto SubImm = Op->getImm();
+    if (!I) {
+      Imm = SubImm;
+      if (!TII->isInlineConstant(*Op, OpTy) ||
+          !TII->isOperandLegal(*UseMI, UseOpIdx, Op))
+        return false;
+
+      continue;
+    }
+    if (Imm != SubImm)
+      return false; // Can only fold splat constants
+  }
+
+  appendFoldCandidate(FoldList, UseMI, UseOpIdx, Defs[0].first);
+  return true;
 }
 
 void SIFoldOperandsImpl::foldOperand(
@@ -1053,34 +970,21 @@ void SIFoldOperandsImpl::foldOperand(
     Register RegSeqDstReg = UseMI->getOperand(0).getReg();
     unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
 
-    MachineOperand *SplatVal;
-    const TargetRegisterClass *SplatRC;
-    std::tie(SplatVal, SplatRC) = isRegSeqSplat(*UseMI);
-
     // Grab the use operands first
     SmallVector<MachineOperand *, 4> UsesToProcess(
         llvm::make_pointer_range(MRI->use_nodbg_operands(RegSeqDstReg)));
     for (auto *RSUse : UsesToProcess) {
       MachineInstr *RSUseMI = RSUse->getParent();
-      unsigned OpNo = RSUseMI->getOperandNo(RSUse);
 
-      if (SplatVal) {
-        if (MachineOperand *Foldable =
-                tryFoldRegSeqSplat(RSUseMI, OpNo, SplatVal, SplatRC)) {
-          appendFoldCandidate(FoldList, RSUseMI, OpNo, Foldable);
-          continue;
-        }
-      }
-
-      if (RSUse->getSubReg() != RegSeqDstSubReg)
+      if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI,
+                         RSUseMI->getOperandNo(RSUse), FoldList))
         continue;
 
-      if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI, OpNo, FoldList))
+      if (RSUse->getSubReg() != RegSeqDstSubReg)
        continue;
 
-      foldOperand(OpToFold, RSUseMI, OpNo, FoldList, CopiesToReplace);
+      foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList, CopiesToReplace);
     }
-
     return;
   }
 
@@ -2232,7 +2136,7 @@ bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) {
     return false;
 
   SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
-  if (!getRegSeqInit(Defs, Reg))
+  if (!getRegSeqInit(Defs, Reg, MCOI::OPERAND_REGISTER))
    return false;
 
  for (auto &[Op, SubIdx] : Defs) {
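
For readers following the restored tryToFoldACImm path above: the core idea is that a reg_sequence whose sub-register initializers all resolve to the same foldable inline immediate can be treated as one splat constant and folded directly into the use operand. The standalone sketch below models only that splat check outside of LLVM; the SubRegInit alias, the isInlineConstant range stub, and the findSplatImm helper are illustrative stand-ins, not APIs from this patch.

// Standalone model of the splat check in the restored tryToFoldACImm:
// every reg_sequence initializer must be the same immediate, and that
// immediate must be a legal inline constant.
#include <cstdint>
#include <iostream>
#include <optional>
#include <utility>
#include <vector>

// Stand-in for the (MachineOperand *, SubRegIdx) pairs collected by
// getRegSeqInit; here each initializer is just an immediate value.
using SubRegInit = std::pair<int64_t, unsigned>;

// Stand-in for TII->isInlineConstant(): accept the small integers the
// hardware can encode directly (simplified range, for illustration only).
static bool isInlineConstant(int64_t Imm) { return Imm >= -16 && Imm <= 64; }

// Returns the splat immediate if every initializer is the same foldable
// constant, otherwise std::nullopt (mirrors the early "return false"
// paths in the pass).
static std::optional<int64_t>
findSplatImm(const std::vector<SubRegInit> &Defs) {
  if (Defs.empty())
    return std::nullopt;

  int64_t Imm = Defs.front().first;
  if (!isInlineConstant(Imm))
    return std::nullopt;

  for (const SubRegInit &D : Defs)
    if (D.first != Imm)
      return std::nullopt; // Can only fold splat constants.

  return Imm;
}

int main() {
  // A 4-element reg_sequence where every lane is initialized to 1.
  std::vector<SubRegInit> Splat = {{1, 0}, {1, 1}, {1, 2}, {1, 3}};
  // A sequence with mixed values cannot be folded as a single operand.
  std::vector<SubRegInit> Mixed = {{1, 0}, {2, 1}, {1, 2}, {2, 3}};

  if (auto Imm = findSplatImm(Splat))
    std::cout << "splat immediate: " << *Imm << "\n";
  if (!findSplatImm(Mixed))
    std::cout << "mixed sequence: not foldable\n";
  return 0;
}

In the actual pass, the per-element values come from getRegSeqInit, and legality goes through TII->isInlineConstant and TII->isOperandLegal rather than this simplified range test.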

llvm/test/CodeGen/AMDGPU/packed-fp32.ll

Lines changed: 14 additions & 0 deletions
@@ -1713,6 +1713,7 @@ define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) {
 ; PACKED-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; PACKED-SDAG-NEXT:    s_endpgm
 ;
+<<<<<<< HEAD
 ; GFX90A-GISEL-LABEL: fma_v2_v_lit_splat:
 ; GFX90A-GISEL:       ; %bb.0:
 ; GFX90A-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -1746,6 +1747,19 @@ define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) {
 ; GFX942-GISEL-NEXT:    v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
 ; GFX942-GISEL-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
 ; GFX942-GISEL-NEXT:    s_endpgm
+=======
+; PACKED-GISEL-LABEL: fma_v2_v_lit_splat:
+; PACKED-GISEL:       ; %bb.0:
+; PACKED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; PACKED-GISEL-NEXT:    v_pk_fma_f32 v[0:1], v[0:1], 4.0, 1.0
+; PACKED-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-GISEL-NEXT:    s_endpgm
+>>>>>>> parent of 4ddab1252fe6... AMDGPU: Move reg_sequence splat handling (#140313)
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
   %load = load <2 x float>, ptr addrspace(1) %gep, align 8
