Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -2342,8 +2342,14 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;

def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
multiclass AMDGPUWaveReduceOps {
foreach Op =
["umin", "min", "umax", "max", "add", "sub", "and", "or", "xor"] in {
def Op : AMDGPUWaveReduce;
}
}

defm int_amdgcn_wave_reduce_ : AMDGPUWaveReduceOps;

def int_amdgcn_readfirstlane :
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
Expand Down
9 changes: 8 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5006,8 +5006,15 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
break;
}
case Intrinsic::amdgcn_wave_reduce_add:
case Intrinsic::amdgcn_wave_reduce_sub:
case Intrinsic::amdgcn_wave_reduce_min:
case Intrinsic::amdgcn_wave_reduce_umin:
case Intrinsic::amdgcn_wave_reduce_umax: {
case Intrinsic::amdgcn_wave_reduce_max:
case Intrinsic::amdgcn_wave_reduce_umax:
case Intrinsic::amdgcn_wave_reduce_and:
case Intrinsic::amdgcn_wave_reduce_or:
case Intrinsic::amdgcn_wave_reduce_xor: {
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
Expand Down
122 changes: 111 additions & 11 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5038,6 +5038,28 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
return LoopBB;
}

static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
switch (Opc) {
case AMDGPU::S_MIN_U32:
return std::numeric_limits<uint32_t>::max();
case AMDGPU::S_MIN_I32:
return std::numeric_limits<int32_t>::max();
case AMDGPU::S_MAX_U32:
return std::numeric_limits<uint32_t>::min();
case AMDGPU::S_MAX_I32:
return std::numeric_limits<int32_t>::min();
case AMDGPU::S_ADD_I32:
case AMDGPU::S_SUB_I32:
case AMDGPU::S_OR_B32:
case AMDGPU::S_XOR_B32:
return std::numeric_limits<uint32_t>::min();
case AMDGPU::S_AND_B32:
return std::numeric_limits<uint32_t>::max();
default:
llvm_unreachable("Unexpected opcode in getIdentityValueForWaveReduction");
}
}

static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
MachineBasicBlock &BB,
const GCNSubtarget &ST,
Expand All @@ -5053,13 +5075,78 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
Register DstReg = MI.getOperand(0).getReg();
MachineBasicBlock *RetBB = nullptr;
if (isSGPR) {
// These operations with a uniform value i.e. SGPR are idempotent.
// Reduced value will be same as given sgpr.
// clang-format off
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
.addReg(SrcReg);
// clang-format on
RetBB = &BB;
switch (Opc) {
case AMDGPU::S_MIN_U32:
case AMDGPU::S_MIN_I32:
case AMDGPU::S_MAX_U32:
case AMDGPU::S_MAX_I32:
case AMDGPU::S_AND_B32:
case AMDGPU::S_OR_B32: {
// Idempotent operations.
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
RetBB = &BB;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't really follow what RetBB is for, it's always identical to BB?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

RetBB is set to ComputeEnd incase the basic block is split and new ones are inserted.

break;
}
case AMDGPU::S_XOR_B32:
case AMDGPU::S_ADD_I32:
case AMDGPU::S_SUB_I32: {
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);

bool IsWave32 = ST.isWave32();
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
unsigned CountReg =
IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;

auto Exec =
BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);

auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
.addReg(Exec->getOperand(0).getReg());

switch (Opc) {
case AMDGPU::S_XOR_B32: {
// Performing an XOR operation on a uniform value
// depends on the parity of the number of active lanes.
// For even parity, the result will be 0, for odd
// parity the result will be the same as the input value.
Register ParityRegister = MRI.createVirtualRegister(DstRegClass);

auto ParityReg =
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
.addReg(NewAccumulator->getOperand(0).getReg())
.addImm(1);
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
.addReg(SrcReg)
.addReg(ParityReg->getOperand(0).getReg());
break;
}
case AMDGPU::S_SUB_I32: {
Register NegatedVal = MRI.createVirtualRegister(DstRegClass);

// Take the negation of the source operand.
auto InvertedValReg =
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
.addImm(-1)
.addReg(SrcReg);
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
.addReg(InvertedValReg->getOperand(0).getReg())
.addReg(NewAccumulator->getOperand(0).getReg());
break;
}
case AMDGPU::S_ADD_I32: {
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
.addReg(SrcReg)
.addReg(NewAccumulator->getOperand(0).getReg());
break;
}
}
RetBB = &BB;
}
}
} else {
// TODO: Implement DPP Strategy and switch based on immediate strategy
// operand. For now, for all the cases (default, Iterative and DPP we use
Expand Down Expand Up @@ -5095,10 +5182,9 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

// Create initail values of induction variable from Exec, Accumulator and
// insert branch instr to newly created ComputeBlockk
uint32_t InitalValue =
(Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
// Create initial values of induction variable from Exec, Accumulator and
// insert branch instr to newly created ComputeBlock
uint32_t InitalValue = getIdentityValueForWaveReduction(Opc);
auto TmpSReg =
BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
Expand Down Expand Up @@ -5170,8 +5256,22 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
switch (MI.getOpcode()) {
case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
case AMDGPU::S_UADDO_PSEUDO:
case AMDGPU::S_USUBO_PSEUDO: {
const DebugLoc &DL = MI.getDebugLoc();
Expand Down
29 changes: 21 additions & 8 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -303,16 +303,29 @@ def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
(V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;

let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$strategy))]> {
// clang-format off
defvar int_amdgcn_wave_reduce_ = "int_amdgcn_wave_reduce_";
multiclass
AMDGPUWaveReducePseudoGenerator<string Op, string DataType> {
let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
def !toupper(Op) #"_PSEUDO_" #DataType
: VPseudoInstSI<(outs SGPR_32 : $sdst),
(ins VSrc_b32 : $src, VSrc_b32 : $strategy),
[(set i32 : $sdst, (!cast<AMDGPUWaveReduce>(int_amdgcn_wave_reduce_ #Op) i32 : $src, i32 : $strategy))]> {}
}
}
// clang-format on

def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> {
}
// Input list : [Operation_name,
// type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B)]
defvar Operations = [
["umin", "U32"], ["min", "I32"], ["umax", "U32"], ["max", "I32"],
["add", "I32"], ["sub", "I32"], ["and", "B32"], ["or", "B32"],
["xor", "B32"]
];

foreach Op = Operations in {
defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<Op[0], Op[1]>;
}

let usesCustomInserter = 1, Defs = [VCC] in {
Expand Down
Loading
Loading