AMDGPU: Move reg_sequence splat handling #140313
This code clunkily tried to find a splat reg_sequence by looking at every use of the reg_sequence, and then looking back at the reg_sequence to see if it is a splat. Extract this into a separate helper function to help clean this up. This now determines once whether the reg_sequence forms a splat, and defers the legal inline immediate check to the use check (which is really use-context dependent).

The one regression is in globalisel, which has an extra copy that should have been separately folded out. It was getting dealt with by the handling of foldable copies in tryToFoldACImm.

This is preparation for #139317
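Before reading the diff, a minimal standalone C++ sketch of the new shape of the logic may help. The names SeqInput, getSplatValue, tryFoldSplatAtUse, and IsLegalAtThisUse are hypothetical stand-ins, not the SIFoldOperands API; the actual helpers in the patch are getRegSeqInit, isRegSeqSplat, and tryFoldRegSeqSplat, shown in the diff below.

```cpp
// Minimal sketch (hypothetical types/names): the splat decision is made once
// per reg_sequence, and the immediate-legality question is deferred to each use.
#include <cstdint>
#include <functional>
#include <optional>
#include <vector>

struct SeqInput {
  std::optional<int64_t> Imm; // set if this input folds to an immediate
  unsigned SubRegIdx;         // subregister index it initializes
};

// Step 1: scan the reg_sequence inputs once, independent of any use.
std::optional<int64_t> getSplatValue(const std::vector<SeqInput> &Inputs) {
  std::optional<int64_t> Splat;
  for (const SeqInput &In : Inputs) {
    if (!In.Imm)
      return std::nullopt; // non-immediate input: not a splat
    if (!Splat)
      Splat = In.Imm;      // first element defines the candidate value
    else if (*Splat != *In.Imm)
      return std::nullopt; // mismatch: can only fold splat constants
  }
  return Splat;
}

// Step 2: per use, since whether the value is a legal (inline) immediate
// depends on the operand being folded into.
bool tryFoldSplatAtUse(int64_t Splat,
                       const std::function<bool(int64_t)> &IsLegalAtThisUse) {
  return IsLegalAtThisUse(Splat);
}
```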
@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

This code clunkily tried to find a splat reg_sequence by looking at every use of the reg_sequence, and then looking back at the reg_sequence to see if it is a splat. Extract this into a separate helper function to help clean this up. This now determines once whether the reg_sequence forms a splat, and defers the legal inline immediate check to the use check (which is really use-context dependent).

The one regression is in globalisel, which has an extra copy that should have been separately folded out. It was getting dealt with by the handling of foldable copies in tryToFoldACImm.

This is preparation for #139317

Full diff: https://github.com/llvm/llvm-project/pull/140313.diff

2 Files Affected:
- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
- llvm/test/CodeGen/AMDGPU/packed-fp32.ll
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 633a858df364e..92937e33fd500 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -119,9 +119,22 @@ class SIFoldOperandsImpl {
MachineOperand *OpToFold) const;
bool isUseSafeToFold(const MachineInstr &MI,
const MachineOperand &UseMO) const;
- bool
+
+ const TargetRegisterClass *getRegSeqInit(
+ MachineInstr &RegSeq,
+ SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const;
+
+ const TargetRegisterClass *
getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
- Register UseReg, uint8_t OpTy) const;
+ Register UseReg) const;
+
+ std::pair<MachineOperand *, const TargetRegisterClass *>
+ isRegSeqSplat(MachineInstr &RegSeg) const;
+
+ MachineOperand *tryFoldRegSeqSplat(MachineInstr *UseMI, unsigned UseOpIdx,
+ MachineOperand *SplatVal,
+ const TargetRegisterClass *SplatRC) const;
+
bool tryToFoldACImm(MachineOperand &OpToFold, MachineInstr *UseMI,
unsigned UseOpIdx,
SmallVectorImpl<FoldCandidate> &FoldList) const;
@@ -825,19 +838,24 @@ static MachineOperand *lookUpCopyChain(const SIInstrInfo &TII,
return Sub;
}
-// Find a def of the UseReg, check if it is a reg_sequence and find initializers
-// for each subreg, tracking it to foldable inline immediate if possible.
-// Returns true on success.
-bool SIFoldOperandsImpl::getRegSeqInit(
- SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
- Register UseReg, uint8_t OpTy) const {
- MachineInstr *Def = MRI->getVRegDef(UseReg);
- if (!Def || !Def->isRegSequence())
- return false;
+const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
+ MachineInstr &RegSeq,
+ SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const {
- for (unsigned I = 1, E = Def->getNumExplicitOperands(); I != E; I += 2) {
- MachineOperand &SrcOp = Def->getOperand(I);
- unsigned SubRegIdx = Def->getOperand(I + 1).getImm();
+ assert(RegSeq.isRegSequence());
+
+ const TargetRegisterClass *RC = nullptr;
+
+ for (unsigned I = 1, E = RegSeq.getNumExplicitOperands(); I != E; I += 2) {
+ MachineOperand &SrcOp = RegSeq.getOperand(I);
+ unsigned SubRegIdx = RegSeq.getOperand(I + 1).getImm();
+
+ // Only accept reg_sequence with uniform reg class inputs for simplicity.
+ const TargetRegisterClass *OpRC = getRegOpRC(*MRI, *TRI, SrcOp);
+ if (!RC)
+ RC = OpRC;
+ else if (!TRI->getCommonSubClass(RC, OpRC))
+ return nullptr;
if (SrcOp.getSubReg()) {
// TODO: Handle subregister compose
@@ -846,8 +864,7 @@ bool SIFoldOperandsImpl::getRegSeqInit(
}
MachineOperand *DefSrc = lookUpCopyChain(*TII, *MRI, SrcOp.getReg());
- if (DefSrc && (DefSrc->isReg() ||
- (DefSrc->isImm() && TII->isInlineConstant(*DefSrc, OpTy)))) {
+ if (DefSrc && (DefSrc->isReg() || DefSrc->isImm())) {
Defs.emplace_back(DefSrc, SubRegIdx);
continue;
}
@@ -855,7 +872,65 @@ bool SIFoldOperandsImpl::getRegSeqInit(
Defs.emplace_back(&SrcOp, SubRegIdx);
}
- return true;
+ return RC;
+}
+
+// Find a def of the UseReg, check if it is a reg_sequence and find initializers
+// for each subreg, tracking it to an immediate if possible. Returns the
+// register class of the inputs on success.
+const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
+ SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
+ Register UseReg) const {
+ MachineInstr *Def = MRI->getVRegDef(UseReg);
+ if (!Def || !Def->isRegSequence())
+ return nullptr;
+
+ return getRegSeqInit(*Def, Defs);
+}
+
+std::pair<MachineOperand *, const TargetRegisterClass *>
+SIFoldOperandsImpl::isRegSeqSplat(MachineInstr &RegSeq) const {
+ SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
+ const TargetRegisterClass *SrcRC = getRegSeqInit(RegSeq, Defs);
+ if (!SrcRC)
+ return {};
+
+ int64_t Imm;
+ for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
+ const MachineOperand *Op = Defs[I].first;
+ if (!Op->isImm())
+ return {};
+
+ int64_t SubImm = Op->getImm();
+ if (!I) {
+ Imm = SubImm;
+ continue;
+ }
+ if (Imm != SubImm)
+ return {}; // Can only fold splat constants
+ }
+
+ return {Defs[0].first, SrcRC};
+}
+
+MachineOperand *SIFoldOperandsImpl::tryFoldRegSeqSplat(
+ MachineInstr *UseMI, unsigned UseOpIdx, MachineOperand *SplatVal,
+ const TargetRegisterClass *SplatRC) const {
+ const MCInstrDesc &Desc = UseMI->getDesc();
+ if (UseOpIdx >= Desc.getNumOperands())
+ return nullptr;
+
+ // Filter out unhandled pseudos.
+ if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
+ return nullptr;
+
+ // FIXME: Verify SplatRC is compatible with the use operand
+ uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
+ if (!TII->isInlineConstant(*SplatVal, OpTy) ||
+ !TII->isOperandLegal(*UseMI, UseOpIdx, SplatVal))
+ return nullptr;
+
+ return SplatVal;
}
bool SIFoldOperandsImpl::tryToFoldACImm(
@@ -869,7 +944,6 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
return false;
- uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
if (OpToFold.isImm()) {
if (unsigned UseSubReg = UseOp.getSubReg()) {
@@ -916,31 +990,7 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
}
}
- SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
- if (!getRegSeqInit(Defs, UseReg, OpTy))
- return false;
-
- int32_t Imm;
- for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
- const MachineOperand *Op = Defs[I].first;
- if (!Op->isImm())
- return false;
-
- auto SubImm = Op->getImm();
- if (!I) {
- Imm = SubImm;
- if (!TII->isInlineConstant(*Op, OpTy) ||
- !TII->isOperandLegal(*UseMI, UseOpIdx, Op))
- return false;
-
- continue;
- }
- if (Imm != SubImm)
- return false; // Can only fold splat constants
- }
-
- appendFoldCandidate(FoldList, UseMI, UseOpIdx, Defs[0].first);
- return true;
+ return false;
}
void SIFoldOperandsImpl::foldOperand(
@@ -970,14 +1020,26 @@ void SIFoldOperandsImpl::foldOperand(
Register RegSeqDstReg = UseMI->getOperand(0).getReg();
unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
+ MachineOperand *SplatVal;
+ const TargetRegisterClass *SplatRC;
+ std::tie(SplatVal, SplatRC) = isRegSeqSplat(*UseMI);
+
// Grab the use operands first
SmallVector<MachineOperand *, 4> UsesToProcess(
llvm::make_pointer_range(MRI->use_nodbg_operands(RegSeqDstReg)));
for (auto *RSUse : UsesToProcess) {
MachineInstr *RSUseMI = RSUse->getParent();
+ unsigned OpNo = RSUseMI->getOperandNo(RSUse);
- if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI,
- RSUseMI->getOperandNo(RSUse), FoldList))
+ if (SplatVal) {
+ if (MachineOperand *Foldable =
+ tryFoldRegSeqSplat(RSUseMI, OpNo, SplatVal, SplatRC)) {
+ appendFoldCandidate(FoldList, RSUseMI, OpNo, Foldable);
+ continue;
+ }
+ }
+
+ if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI, OpNo, FoldList))
continue;
if (RSUse->getSubReg() != RegSeqDstSubReg)
@@ -986,6 +1048,7 @@ void SIFoldOperandsImpl::foldOperand(
foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,
CopiesToReplace);
}
+
return;
}
@@ -2137,7 +2200,7 @@ bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) {
return false;
SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
- if (!getRegSeqInit(Defs, Reg, MCOI::OPERAND_REGISTER))
+ if (!getRegSeqInit(Defs, Reg))
return false;
for (auto &[Op, SubIdx] : Defs) {
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index ecd1abce67571..ddc3e770767b8 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -1680,10 +1680,12 @@ define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) {
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-GISEL-NEXT: s_mov_b32 s2, 1.0
+; PACKED-GISEL-NEXT: s_mov_b32 s3, s2
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
-; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], 4.0, 1.0
+; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], 4.0, s[2:3]
; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
This reverts commit 4ddab12. Change-Id: I861cf070d77cab094dda697ba4b4ce017407f1e0
This reverts commit c8d27ce.

This code clunkily tried to find a splat reg_sequence by looking at every use of the reg_sequence, and then looking back at the reg_sequence to see if it is a splat. Extract this into a separate helper function to help clean this up. This now determines once whether the reg_sequence forms a splat, and defers the legal inline immediate check to the use check (which is really use-context dependent).
The one regression is in globalisel, which has an extra
copy that should have been separately folded out. It was getting
dealt with by the handling of foldable copies in tryToFoldACImm.
This is preparation for #139908 and #139317
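As a usage note for the sketch near the top of this page: the splat value is computed once per reg_sequence, and every use then applies its own legality predicate, mirroring the updated foldOperand() loop in the diff. The driver below is illustrative only and reuses the hypothetical SeqInput/getSplatValue/tryFoldSplatAtUse helpers from that sketch; 0x3f800000 is simply the IEEE-754 bit pattern of 1.0f, matching the splat value in the packed-fp32.ll test.

```cpp
// Illustrative driver; assumes the SeqInput / getSplatValue / tryFoldSplatAtUse
// sketch shown earlier on this page.
#include <cstdint>
#include <cstdio>
#include <optional>
#include <vector>

int main() {
  // A two-element reg_sequence whose lanes hold the same immediate: a splat.
  std::vector<SeqInput> Inputs = {{int64_t{0x3f800000}, 0},
                                  {int64_t{0x3f800000}, 1}};

  if (std::optional<int64_t> Splat = getSplatValue(Inputs)) {
    // Each use supplies its own predicate: the same splat may be a legal
    // inline immediate for one instruction operand and not for another,
    // which is why the legality check is deferred to the use.
    bool LegalHere =
        tryFoldSplatAtUse(*Splat, [](int64_t V) { return V == 0x3f800000; });
    std::printf("splat folds at this use: %s\n", LegalHere ? "yes" : "no");
  }
  return 0;
}
```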