148 changes: 148 additions & 0 deletions llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -126,6 +126,7 @@ class SIFoldOperandsImpl {
std::optional<int64_t> getImmOrMaterializedImm(MachineOperand &Op) const;
bool tryConstantFoldOp(MachineInstr *MI) const;
bool tryFoldCndMask(MachineInstr &MI) const;
bool tryScalarizeReadLaneSrc(MachineInstr &MI) const;
bool tryFoldZeroHighBits(MachineInstr &MI) const;
bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;

@@ -1407,6 +1408,148 @@ bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
return true;
}

static unsigned
getScalarizedReadLaneSrcOpc(const GCNSubtarget &ST, unsigned Opc,
SmallVectorImpl<MachineOperand *> &Ops) {
// Opcodes here are added as needed because there are hundreds of
// instructions we could convert, but realistically we only need
// the most frequent ones to make an impact.
//
// The InstCombine version of this transform does the heavy lifting;
// this is just a cleanup for the readlanes added during lowering.
switch (Opc) {
case AMDGPU::V_OR_B32_e32:
case AMDGPU::V_OR_B32_e64:
return AMDGPU::S_OR_B32;
case AMDGPU::V_MUL_HI_U32_e64:
if (ST.getGeneration() >= GCNSubtarget::GFX9)
return AMDGPU::S_MUL_HI_U32;
break;
case AMDGPU::V_AND_B32_e32:
case AMDGPU::V_AND_B32_e64:
return AMDGPU::S_AND_B32;
case AMDGPU::V_LSHRREV_B32_e32: // dst = S1 >> S0
case AMDGPU::V_LSHRREV_B32_e64:
std::swap(Ops[0], Ops[1]); // dst = S0 >> S1 (!)
return AMDGPU::S_LSHR_B32;
case AMDGPU::V_CVT_U32_F32_e32:
case AMDGPU::V_CVT_U32_F32_e64:
if (ST.hasSALUFloatInsts())
return AMDGPU::S_CVT_U32_F32;
break;
case AMDGPU::V_MIN_U32_e32:
case AMDGPU::V_MIN_U32_e64:
return AMDGPU::S_MIN_U32;
case AMDGPU::V_MIN_I32_e32:
case AMDGPU::V_MIN_I32_e64:
return AMDGPU::S_MIN_I32;
case AMDGPU::V_MAX_U32_e32:
case AMDGPU::V_MAX_U32_e64:
return AMDGPU::S_MAX_U32;
case AMDGPU::V_MAX_I32_e32:
case AMDGPU::V_MAX_I32_e64:
return AMDGPU::S_MAX_I32;
default:
break;
}

return -1;
}

// Try to transform
// %0:vgpr = (valu op) %x:vgpr
// %1:sgpr = v_readfirstlane %0
// Into
// %0:sgpr = v_readfirstlane %x:vgpr
// %1:sgpr = (salu op) %0
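//
// For example (illustrative registers; any other source operand must be
// an SGPR or an immediate):
//   %2:vgpr = V_OR_B32_e64 %0:vgpr, %1:sgpr
//   %3:sgpr = V_READFIRSTLANE_B32 %2
// becomes
//   %4:sgpr = V_READFIRSTLANE_B32 %0
//   %3:sgpr = S_OR_B32 %4, %1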
bool SIFoldOperandsImpl::tryScalarizeReadLaneSrc(MachineInstr &MI) const {
const unsigned Opc = MI.getOpcode();
if (Opc != AMDGPU::V_READFIRSTLANE_B32 && Opc != AMDGPU::V_READLANE_B32)
return false;

const auto VSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
const Register VSrc = MI.getOperand(VSrcIdx).getReg();

if (!MRI->hasOneNonDBGUse(VSrc))
return false;

MachineInstr *VSrcDef = MRI->getVRegDef(VSrc);
// The source must be defined by a unary or binary VALU instruction in the
// same block.
if (!VSrcDef || (VSrcDef->getParent() != MI.getParent()) ||
!TII->isVALU(*VSrcDef) || VSrcDef->getNumExplicitOperands() > 3 ||
execMayBeModifiedBeforeUse(*MRI, VSrc, *VSrcDef, MI))
return false;

const bool IsReadLane = (Opc == AMDGPU::V_READLANE_B32);
if (IsReadLane) {
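// The rewritten readlane is emitted at VSrcDef's position, so a register
// lane select must have the same value there as it has at MI.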
MachineOperand &LaneOp = MI.getOperand(2);
if (LaneOp.isReg()) { // Can the lane be an imm?
Register LaneReg = LaneOp.getReg();
for (auto It = VSrcDef->getIterator(); It != MI.getIterator(); ++It) {
if (It->modifiesRegister(LaneReg, TRI))
return false;
}
}
}

SmallVector<MachineOperand *, 2> Ops;
MachineOperand *TargetOp = nullptr;
for (MachineOperand &SrcOp : VSrcDef->operands()) {
if (SrcOp.isReg()) {
if (SrcOp.isImplicit() || SrcOp.isDef())
continue;

Ops.push_back(&SrcOp);

Register Reg = SrcOp.getReg();
if (TRI->isVectorRegister(*MRI, Reg)) {
// This only works if we have one VGPR src.
if (TargetOp)
return false;
TargetOp = &SrcOp;
}
} else {
Ops.push_back(&SrcOp); // also collect imms
}
}
if (!TargetOp)
return false;

LLVM_DEBUG(dbgs() << "tryScalarizeReadLaneSrc:\n\treadlane: " << MI
<< "\tsrc: " << *VSrcDef << "\top: " << *TargetOp << "\n");

const unsigned ScalarOp =
getScalarizedReadLaneSrcOpc(*ST, VSrcDef->getOpcode(), Ops);
if (ScalarOp == unsigned(-1))
return false;

// We only support unary/binary ops.
assert(Ops.size() <= 2);

MachineBasicBlock *MBB = VSrcDef->getParent();
auto InsertBefore = VSrcDef->getIterator();
const DebugLoc &DL = VSrcDef->getDebugLoc();
Register SDst = MI.getOperand(0).getReg();

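// Read the lone VGPR operand into a new SGPR at VSrcDef's position, then
// rebuild the operation with its scalar opcode, substituting that SGPR.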
Register STargetOp = MRI->createVirtualRegister(MRI->getRegClass(SDst));
auto NewMI = BuildMI(*MBB, InsertBefore, DL, MI.getDesc(), STargetOp)
.addReg(TargetOp->getReg());
if (IsReadLane)
NewMI.add(MI.getOperand(2)); // lane index
auto ScalarMI = BuildMI(*MBB, InsertBefore, DL, TII->get(ScalarOp), SDst);
for (MachineOperand *Op : Ops) {
if (Op == TargetOp)
ScalarMI.addReg(STargetOp);
else
ScalarMI.add(*Op);
}

VSrcDef->eraseFromParent();
MI.eraseFromParent();
return true;
}

bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
MI.getOpcode() != AMDGPU::V_AND_B32_e32)
@@ -2353,6 +2496,11 @@ bool SIFoldOperandsImpl::run(MachineFunction &MF) {
for (auto &MI : make_early_inc_range(*MBB)) {
Changed |= tryFoldCndMask(MI);

if (tryScalarizeReadLaneSrc(MI)) {
Changed = true;
continue;
}

if (tryFoldZeroHighBits(MI)) {
Changed = true;
continue;
20 changes: 10 additions & 10 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@@ -730,18 +730,18 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX6-NEXT: s_lshl_b32 s3, s4, 24
; GFX6-NEXT: s_lshl_b32 s4, s7, 24
; GFX6-NEXT: s_not_b32 s5, s3
; GFX6-NEXT: s_min_u32 s4, s5, s4
; GFX6-NEXT: s_lshr_b32 s1, s1, 24
; GFX6-NEXT: s_min_u32 s4, s5, s4
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: s_lshr_b32 s2, s2, 24
; GFX6-NEXT: s_add_i32 s3, s3, s4
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: s_lshr_b32 s3, s3, 24
; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 24
; GFX6-NEXT: s_lshr_b32 s3, s3, 24
; GFX6-NEXT: s_lshl_b32 s0, s2, 16
; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
; GFX6-NEXT: s_lshl_b32 s0, s3, 24
; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
; GFX6-NEXT: v_readfirstlane_b32 s0, v0
; GFX6-NEXT: v_readfirstlane_b32 s1, v0
; GFX6-NEXT: s_or_b32 s0, s1, s0
; GFX6-NEXT: s_lshl_b32 s1, s3, 24
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_uaddsat_v4i8:
@@ -1020,8 +1020,8 @@ define amdgpu_ps i24 @s_uaddsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
; GFX8-NEXT: s_lshl_b32 s0, s0, 8
; GFX8-NEXT: v_mov_b32_e32 v0, s1
; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 clamp
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: s_lshr_b32 s0, s0, 8
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_uaddsat_i24:
@@ -1030,17 +1030,17 @@ define amdgpu_ps i24 @s_uaddsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
; GFX9-NEXT: s_lshl_b32 s0, s0, 8
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: s_lshr_b32 s0, s0, 8
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_uaddsat_i24:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8
; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8
; GFX10PLUS-NEXT: v_add_nc_u32_e64 v0, s0, s1 clamp
; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, 8
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call i24 @llvm.uadd.sat.i24(i24 %lhs, i24 %rhs)
ret i24 %result
20 changes: 10 additions & 10 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -714,18 +714,18 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX6-NEXT: s_sub_i32 s2, s2, s3
; GFX6-NEXT: s_lshl_b32 s3, s4, 24
; GFX6-NEXT: s_lshl_b32 s4, s7, 24
; GFX6-NEXT: s_min_u32 s4, s3, s4
; GFX6-NEXT: s_lshr_b32 s1, s1, 24
; GFX6-NEXT: s_min_u32 s4, s3, s4
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: s_lshr_b32 s2, s2, 24
; GFX6-NEXT: s_sub_i32 s3, s3, s4
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: s_lshr_b32 s3, s3, 24
; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 24
; GFX6-NEXT: s_lshr_b32 s3, s3, 24
; GFX6-NEXT: s_lshl_b32 s0, s2, 16
; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
; GFX6-NEXT: s_lshl_b32 s0, s3, 24
; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
; GFX6-NEXT: v_readfirstlane_b32 s0, v0
; GFX6-NEXT: v_readfirstlane_b32 s1, v0
; GFX6-NEXT: s_or_b32 s0, s1, s0
; GFX6-NEXT: s_lshl_b32 s1, s3, 24
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_usubsat_v4i8:
@@ -1002,8 +1002,8 @@ define amdgpu_ps i24 @s_usubsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
; GFX8-NEXT: s_lshl_b32 s0, s0, 8
; GFX8-NEXT: v_mov_b32_e32 v0, s1
; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s0, v0 clamp
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: s_lshr_b32 s0, s0, 8
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_usubsat_i24:
@@ -1012,17 +1012,17 @@ define amdgpu_ps i24 @s_usubsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
; GFX9-NEXT: s_lshl_b32 s0, s0, 8
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: s_lshr_b32 s0, s0, 8
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_usubsat_i24:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8
; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8
; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, s0, s1 clamp
; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, 8
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call i24 @llvm.usub.sat.i24(i24 %lhs, i24 %rhs)
ret i24 %result