
Commit c14e992

Reapply "AMDGPU: Move reg_sequence splat handling (llvm#140313)"
This reverts commit c8d27ce.
1 parent: d8a7254

File tree

3 files changed: 28 additions & 44 deletions


llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 27 additions & 42 deletions
@@ -219,7 +219,12 @@ class SIFoldOperandsImpl {
                         const FoldableDef &OpToFold) const;
   bool isUseSafeToFold(const MachineInstr &MI,
                        const MachineOperand &UseMO) const;
-  bool
+
+  const TargetRegisterClass *getRegSeqInit(
+      MachineInstr &RegSeq,
+      SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const;
+
+  const TargetRegisterClass *
   getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
                 Register UseReg) const;
 
@@ -935,19 +940,24 @@ static MachineOperand *lookUpCopyChain(const SIInstrInfo &TII,
   return Sub;
 }
 
-// Find a def of the UseReg, check if it is a reg_sequence and find initializers
-// for each subreg, tracking it to foldable inline immediate if possible.
-// Returns true on success.
-bool SIFoldOperandsImpl::getRegSeqInit(
-    SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
-    Register UseReg, uint8_t OpTy) const {
-  MachineInstr *Def = MRI->getVRegDef(UseReg);
-  if (!Def || !Def->isRegSequence())
-    return false;
+const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
+    MachineInstr &RegSeq,
+    SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const {
 
-  for (unsigned I = 1, E = Def->getNumExplicitOperands(); I != E; I += 2) {
-    MachineOperand &SrcOp = Def->getOperand(I);
-    unsigned SubRegIdx = Def->getOperand(I + 1).getImm();
+  assert(RegSeq.isRegSequence());
+
+  const TargetRegisterClass *RC = nullptr;
+
+  for (unsigned I = 1, E = RegSeq.getNumExplicitOperands(); I != E; I += 2) {
+    MachineOperand &SrcOp = RegSeq.getOperand(I);
+    unsigned SubRegIdx = RegSeq.getOperand(I + 1).getImm();
+
+    // Only accept reg_sequence with uniform reg class inputs for simplicity.
+    const TargetRegisterClass *OpRC = getRegOpRC(*MRI, *TRI, SrcOp);
+    if (!RC)
+      RC = OpRC;
+    else if (!TRI->getCommonSubClass(RC, OpRC))
+      return nullptr;
 
     if (SrcOp.getSubReg()) {
       // TODO: Handle subregister compose
@@ -956,8 +966,7 @@ bool SIFoldOperandsImpl::getRegSeqInit(
     }
 
     MachineOperand *DefSrc = lookUpCopyChain(*TII, *MRI, SrcOp.getReg());
-    if (DefSrc && (DefSrc->isReg() ||
-                   (DefSrc->isImm() && TII->isInlineConstant(*DefSrc, OpTy)))) {
+    if (DefSrc && (DefSrc->isReg() || DefSrc->isImm())) {
       Defs.emplace_back(DefSrc, SubRegIdx);
       continue;
     }
@@ -1112,7 +1121,6 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
   if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
     return false;
 
-  uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
   MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
   if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) {
     appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold);
@@ -1147,31 +1155,7 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
     }
   }
 
-  SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
-  if (!getRegSeqInit(Defs, UseReg, OpTy))
-    return false;
-
-  int32_t Imm;
-  for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
-    const MachineOperand *Op = Defs[I].first;
-    if (!Op->isImm())
-      return false;
-
-    auto SubImm = Op->getImm();
-    if (!I) {
-      Imm = SubImm;
-      if (!TII->isInlineConstant(*Op, OpTy) ||
-          !TII->isOperandLegal(*UseMI, UseOpIdx, Op))
-        return false;
-
-      continue;
-    }
-    if (Imm != SubImm)
-      return false; // Can only fold splat constants
-  }
-
-  appendFoldCandidate(FoldList, UseMI, UseOpIdx, Defs[0].first);
-  return true;
+  return false;
 }
 
 void SIFoldOperandsImpl::foldOperand(
@@ -1236,6 +1220,7 @@ void SIFoldOperandsImpl::foldOperand(
       foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,
                   CopiesToReplace);
     }
+
     return;
  }
 
@@ -2387,7 +2372,7 @@ bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) {
    return false;
 
  SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
-  if (!getRegSeqInit(Defs, Reg, MCOI::OPERAND_REGISTER))
+  if (!getRegSeqInit(Defs, Reg))
    return false;
 
  for (auto &[Op, SubIdx] : Defs) {
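
For readers following the change: the block deleted from tryToFoldACImm above was essentially a splat test over the reg_sequence initializers collected by getRegSeqInit. Below is a minimal, hypothetical sketch of that test; the helper name getSplatImm is invented here, and the isInlineConstant/isOperandLegal legality checks from the original loop are omitted. Per the commit title, the handling itself moves to the reg_sequence folding path, and tryToFoldACImm now simply returns false at this point.

// Sketch only: detect a splat constant among reg_sequence initializers,
// mirroring the loop deleted from tryToFoldACImm (legality checks omitted).
#include "llvm/ADT/ArrayRef.h"
#include "llvm/CodeGen/MachineOperand.h"
#include <cstdint>
#include <optional>
#include <utility>

using namespace llvm;

// Returns the common immediate if every initializer is the same immediate,
// std::nullopt otherwise.
static std::optional<int64_t>
getSplatImm(ArrayRef<std::pair<MachineOperand *, unsigned>> Defs) {
  std::optional<int64_t> Splat;
  for (const std::pair<MachineOperand *, unsigned> &Def : Defs) {
    const MachineOperand *Op = Def.first;
    if (!Op->isImm())
      return std::nullopt; // Non-immediate initializer: nothing to fold.
    int64_t Imm = Op->getImm();
    if (!Splat)
      Splat = Imm;         // First element fixes the candidate value.
    else if (*Splat != Imm)
      return std::nullopt; // Mixed values: only splat constants fold.
  }
  return Splat;
}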

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 1 addition & 1 deletion
@@ -1346,7 +1346,7 @@ unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs) {
 
 unsigned getVGPRReductionToIncreaseWavesPerEU(const MCSubtargetInfo *STI,
                                               unsigned NumVGPRs) {
-  unsigned Granule = getVGPRAllocGranule(STI);
+  unsigned Granule = getVGPRAllocGranule(STI, 0, std::nullopt);
   unsigned MaxWaves = getMaxWavesPerEU(STI);
   unsigned TotalNumVGPRs = getTotalNumVGPRs(STI);

llvm/test/CodeGen/AMDGPU/packed-fp32.ll

Lines changed: 0 additions & 1 deletion
@@ -2769,7 +2769,6 @@ define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) {
 ; PACKED-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; PACKED-SDAG-NEXT:    s_endpgm
 ;
-<<<<<<< HEAD
 ; GFX90A-GISEL-LABEL: fma_v2_v_lit_splat:
 ; GFX90A-GISEL:       ; %bb.0:
 ; GFX90A-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
