Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -2119,8 +2119,14 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;

def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
multiclass AMDGPUWaveReduceGenerator<list<string> Operations> {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rename Operations with WaveReduceOps ?

foreach Op = Operations in {
def Op : AMDGPUWaveReduce;
}
}

defvar Operations = ["umin", "min", "umax", "max", "uadd", "add", "usub", "sub", "and", "or", "xor"];
defm int_amdgcn_wave_reduce_ : AMDGPUWaveReduceGenerator<Operations>;

def int_amdgcn_readfirstlane :
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
Expand Down
13 changes: 11 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4846,8 +4846,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
break;
}
case Intrinsic::amdgcn_wave_reduce_umin:
case Intrinsic::amdgcn_wave_reduce_umax: {
case Intrinsic::amdgcn_wave_reduce_add:
case Intrinsic::amdgcn_wave_reduce_uadd:
case Intrinsic::amdgcn_wave_reduce_sub:
case Intrinsic::amdgcn_wave_reduce_usub:
case Intrinsic::amdgcn_wave_reduce_min:
case Intrinsic::amdgcn_wave_reduce_umin:
case Intrinsic::amdgcn_wave_reduce_max:
case Intrinsic::amdgcn_wave_reduce_umax:
case Intrinsic::amdgcn_wave_reduce_and:
case Intrinsic::amdgcn_wave_reduce_or:
case Intrinsic::amdgcn_wave_reduce_xor: {
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
Expand Down
117 changes: 110 additions & 7 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4846,6 +4846,26 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
return LoopBB;
}

static uint32_t getInitialValueForWaveReduction(unsigned Opc){
switch(Opc){
case AMDGPU::S_MIN_U32:
return std::numeric_limits<uint32_t>::max();
case AMDGPU::S_MIN_I32:
return std::numeric_limits<int32_t>::max();
case AMDGPU::S_MAX_U32:
return 0;
case AMDGPU::S_MAX_I32:
return std::numeric_limits<int32_t>::min();
case AMDGPU::S_ADD_I32:
case AMDGPU::S_SUB_I32:
case AMDGPU::S_OR_B32:
case AMDGPU::S_XOR_B32:
return 0x00000000;

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Literal values are not consistent form here (0, 0x00000000, 0xFFFFFFFF). Can we not query this using std::numeric_limits here also?

case AMDGPU::S_AND_B32:
return 0xFFFFFFFF;
}
}

static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
MachineBasicBlock &BB,
const GCNSubtarget &ST,
Expand All @@ -4861,10 +4881,75 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
Register DstReg = MI.getOperand(0).getReg();
MachineBasicBlock *RetBB = nullptr;
if (isSGPR) {
// These operations with a uniform value i.e. SGPR are idempotent.
// Reduced value will be same as given sgpr.
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
RetBB = &BB;
switch(Opc){
case AMDGPU::S_MIN_U32:
case AMDGPU::S_MIN_I32:
case AMDGPU::S_MAX_U32:
case AMDGPU::S_MAX_I32:
case AMDGPU::S_AND_B32:
case AMDGPU::S_OR_B32:{
// Idempotent operations.
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
RetBB = &BB;
break;
}
case AMDGPU::S_XOR_B32:
case AMDGPU::S_ADD_I32:
case AMDGPU::S_SUB_I32:{
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
Register CountOfActiveLanesReg = MRI.createVirtualRegister(DstRegClass);

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just ActiveLanes ? In general, you are mangling type in your variable names everywhere. we can avoid that.


bool IsWave32 = ST.isWave32();
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
unsigned CountReg = IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;

// Create initail values of induction variable from Exec, Accumulator and
// insert branch instr to newly created ComputeBlock
auto Exec =
BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);

auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), CountOfActiveLanesReg)
.addReg(Exec->getOperand(0).getReg());

switch(Opc){
case AMDGPU::S_XOR_B32:{
// Performing an XOR operation on a uniform value
// depends on the parity of the number of active lanes.
// For even parity, the result will be 0, for odd
// parity the result will be the same as the input value.
Register ParityRegister = MRI.createVirtualRegister(DstRegClass);

auto ParityReg = BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
.addReg(NewAccumulator->getOperand(0).getReg())
.addImm(1);
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
.addReg(SrcReg)
.addReg(ParityReg->getOperand(0).getReg()) ;
break;
}
case AMDGPU::S_SUB_I32:{
Register NegatedVal = MRI.createVirtualRegister(DstRegClass);

// Take the negation of the source operand.
auto InvertedValReg = BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal).addImm(-1).addReg(SrcReg);
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
.addReg(InvertedValReg->getOperand(0).getReg())
.addReg(NewAccumulator->getOperand(0).getReg());
break;
}
case AMDGPU::S_ADD_I32:{
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
.addReg(SrcReg)
.addReg(NewAccumulator->getOperand(0).getReg());
break;
}
}
RetBB = &BB;
}
}
} else {
// TODO: Implement DPP Strategy and switch based on immediate strategy
// operand. For now, for all the cases (default, Iterative and DPP we use
Expand Down Expand Up @@ -4900,9 +4985,9 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

// Create initail values of induction variable from Exec, Accumulator and
// insert branch instr to newly created ComputeBlockk
uint32_t InitalValue =
(Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
// insert branch instr to newly created ComputeBlock
uint32_t InitalValue = getInitialValueForWaveReduction(Opc);

auto TmpSReg =
BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
Expand Down Expand Up @@ -4970,8 +5055,26 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
switch (MI.getOpcode()) {
case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
case AMDGPU::WAVE_REDUCE_UADD_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
case AMDGPU::WAVE_REDUCE_USUB_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
case AMDGPU::S_UADDO_PSEUDO:
case AMDGPU::S_USUBO_PSEUDO: {
const DebugLoc &DL = MI.getDebugLoc();
Expand Down
35 changes: 27 additions & 8 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -254,16 +254,35 @@ def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
(V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;

let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$strategy))]> {
defvar int_amdgcn_wave_reduce_ = "int_amdgcn_wave_reduce_";
multiclass
AMDGPUWaveReducePseudoGenerator<string Op, string DataType, string Size> {
let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
def !toupper(Op) #"_PSEUDO_" #DataType #Size
: VPseudoInstSI<(outs SGPR_32
: $sdst),
(ins VSrc_b32
: $src, VSrc_b32
: $strategy),
[(set i32
: $sdst, (!cast<AMDGPUWaveReduce>(
int_amdgcn_wave_reduce_ #Op) i32
: $src, i32
: $strategy))]> {}
}
}

def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> {
}
// Input list : [Operation_name, Signed(I)/Unsigned(U)/Float(F)/Bitwise(B),
// Size_in_bits]
defvar Operations = [
["umin", "U", "32"], ["min", "I", "32"], ["umax", "U", "32"],
["max", "I", "32"], ["uadd", "U", "32"], ["add", "I", "32"],
["usub", "U", "32"], ["sub", "I", "32"], ["and", "B", "32"],
["or", "B", "32"], ["xor", "B", "32"]
];

foreach Op = Operations in {
defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<Op[0], Op[1], Op[2]>;
}

let usesCustomInserter = 1, Defs = [VCC] in {
Expand Down
Loading