llvm/lib/Target/AMDGPU/SIFoldOperands.cpp (117 changes: 114 additions & 3 deletions)
@@ -257,6 +257,7 @@ class SIFoldOperandsImpl {
  std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
  bool tryFoldOMod(MachineInstr &MI);
  bool tryFoldRegSequence(MachineInstr &MI);
  bool tryFoldImmRegSequence(MachineInstr &MI);
  bool tryFoldPhiAGPR(MachineInstr &MI);
  bool tryFoldLoad(MachineInstr &MI);

@@ -2331,6 +2332,114 @@ bool SIFoldOperandsImpl::tryFoldOMod(MachineInstr &MI) {
  return true;
}

// gfx942+ can use V_MOV_B64 for materializing constant immediates.
// For example:
// %0:vgpr_32 = V_MOV_B32 0, implicit $exec
// %1:vreg_64_align2 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1
// ->
// %1:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
bool SIFoldOperandsImpl::tryFoldImmRegSequence(MachineInstr &MI) {
  assert(MI.isRegSequence() &&
         "MachineInstr is not expected REG_SEQUENCE instruction");
  Register Reg = MI.getOperand(0).getReg();
  const TargetRegisterClass *DefRC = MRI->getRegClass(Reg);
  const MCInstrDesc &MovDesc = TII->get(AMDGPU::V_MOV_B64_PSEUDO);
  const TargetRegisterClass *RC =
      TII->getRegClass(MovDesc, 0, TRI, *MI.getMF());

  if (!ST->hasMovB64() || !TRI->isVGPR(*MRI, Reg) ||
      !MRI->hasOneNonDBGUse(Reg) ||
      (!TRI->getCompatibleSubRegClass(DefRC, RC, AMDGPU::sub0_sub1) &&
       DefRC != RC))
    return false;

  SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
  if (!getRegSeqInit(Defs, Reg))
    return false;

  // Only attempt to fold immediate materializations.
  if (!Defs.empty() &&
      llvm::any_of(Defs, [](const std::pair<MachineOperand *, unsigned> &Op) {
        return !Op.first->isImm();
      }))
    return false;

  SmallVector<uint64_t, 8> ImmVals;
  uint64_t ImmVal = 0;
  uint64_t ImmSize = 0;
  uint64_t RemainderSize = TRI->getRegSizeInBits(*DefRC);
  SmallVector<std::pair<MachineOperand *, unsigned>, 4> Remainders;
  for (auto &[Op, SubIdx] : Defs) {
    unsigned SubRegSize = TRI->getSubRegIdxSize(SubIdx);
    unsigned Shift = (TRI->getChannelFromSubReg(SubIdx) % 2) * SubRegSize;
    ImmSize += SubRegSize;
    ImmVal |= Op->getImm() << Shift;

    if (SubRegSize == 64)
      return false;

    if (ImmSize == 64) {
      // Only 32 bit literals can be encoded.
      if (!isUInt<32>(ImmVal))
        return false;
      ImmVals.push_back(ImmVal);
      ImmVal = 0;
      ImmSize = 0;
      RemainderSize -= 64;
    } else if ((RemainderSize / 64 == 0) && (RemainderSize % 64 != 0)) {
      // There's some remainder to consider.
      Remainders.push_back({Op, SubRegSize});
    }
  }

  // Can only combine REG_SEQUENCE into one 64b immediate materialization mov.
  if (DefRC == RC) {
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), MovDesc, Reg)
        .addImm(ImmVals[0]);
    MI.eraseFromParent();
    return true;
  }

  if (ImmVals.size() == 1 && RemainderSize == 0)
    return false;

  // Can't bail from here on out: modifying the MI.

  // Remove source operands.
  for (unsigned i = MI.getNumOperands() - 1; i > 0; --i)
    MI.removeOperand(i);

  unsigned Ch = 0;
  for (uint64_t Val : ImmVals) {
    Register MovReg = MRI->createVirtualRegister(RC);
    // Duplicate vmov imm materializations (e.g., splatted operands) should get
    // combined by MachineCSE pass.
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
            TII->get(AMDGPU::V_MOV_B64_PSEUDO), MovReg)
        .addImm(Val);

    // 2 subregs with no overlap (i.e., sub0_sub1, sub2_sub3, etc.).
    unsigned SubReg64B =
        SIRegisterInfo::getSubRegFromChannel(/*Channel=*/Ch * 2, /*SubRegs=*/2);

    MI.addOperand(MachineOperand::CreateReg(MovReg, /*isDef=*/false));
    MI.addOperand(MachineOperand::CreateImm(SubReg64B));
    ++Ch;
  }
Comment on lines +2412 to +2428
@ritter-x2a (Member), Jun 24, 2025:
Would this try to fold a 3*32-bit reg sequence into two 64-bit movs (which, I'd guess, would break in some way because of mismatched register types)? Or are odd-sized reg sequences impossible here?

Reply from the PR author (Contributor):
Sorry for the late reply. I had been trying to solve the scenario you describe in a PR that I expected would supersede this one (#145052), but to no avail, so I have now fixed it in this PR (although I do wonder whether the BUILD_VECTOR -> REG_SEQUENCE isel might be a better place for this...).
It now rewrites the REG_SEQUENCE with 64-bit movs for as long as possible and falls back on 32-bit movs for the remainder (e.g., v3i32 -> 1x v_mov_b64 + 1x v_mov_b32).
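As an illustration only (not from the patch), here is a sketch of that v3i32 case in the notation of the comment at the top of tryFoldImmRegSequence, with made-up virtual registers, register classes, and immediate values; note the packed 64-bit literal still has to pass the isUInt<32> check in the loop for the v_mov_b64 half to be formed:

  %0:vgpr_32 = V_MOV_B32 15, implicit $exec
  %1:vgpr_32 = V_MOV_B32 0, implicit $exec
  %2:vgpr_32 = V_MOV_B32 7, implicit $exec
  %3:vreg_96_align2 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2
  ->
  %4:vreg_64_align2 = V_MOV_B64_PSEUDO 15, implicit $exec
  %3:vreg_96_align2 = REG_SEQUENCE %4, %subreg.sub0_sub1, %2, %subreg.sub2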

  Ch *= 2;
  for (auto &[Op, Size] : Remainders) {
    unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(/*Channel=*/Ch);
    MachineOperand &Mov = Op->getParent()->getOperand(0);
    MI.addOperand(MachineOperand::CreateReg(Mov.getReg(), /*isDef=*/false));
    MI.addOperand(MachineOperand::CreateImm(SubReg));
    ++Ch;
  }

  LLVM_DEBUG(dbgs() << "Folded into " << MI);

  return true;
}

// Try to fold a reg_sequence with vgpr output and agpr inputs into an
// instruction which can take an agpr. So far that means a store.
bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) {
@@ -2760,9 +2869,11 @@ bool SIFoldOperandsImpl::run(MachineFunction &MF) {
      continue;
    }

-    if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
-      Changed = true;
-      continue;
+    if (MI.isRegSequence()) {
+      if (tryFoldImmRegSequence(MI) || tryFoldRegSequence(MI)) {
+        Changed = true;
+        continue;
+      }
    }

    if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
llvm/test/CodeGen/AMDGPU/flat-scratch.ll (12 changes: 4 additions & 8 deletions)
@@ -4139,8 +4139,7 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
; GFX942-LABEL: store_load_i64_aligned:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 15
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], 15
; GFX942-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
@@ -4250,8 +4249,7 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
; GFX942-LABEL: store_load_i64_unaligned:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 15
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], 15
; GFX942-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
@@ -5010,10 +5008,8 @@ define amdgpu_ps void @large_offset() {
;
; GFX942-LABEL: large_offset:
; GFX942: ; %bb.0: ; %bb
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-NEXT: v_mov_b32_e32 v2, v0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX942-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:3024 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: scratch_load_dwordx4 v[0:3], off, off offset:3024 sc0 sc1