Skip to content

Commit 16e26e7

Browse files
committed
Wave Reduce Intrinsics for Integer Type(32 bit) -> Operations: Add, Sub, Min, Max, AND, OR, XOR
1 parent c808e66 commit 16e26e7

21 files changed

+8874
-24
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2109,7 +2109,7 @@ def int_amdgcn_s_quadmask :
21092109
def int_amdgcn_s_wqm :
21102110
DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem, IntrConvergent]>;
21112111

2112-
class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
2112+
class AMDGPUWaveReduce<LLVMType data_ty = llvm_any_ty> : Intrinsic<
21132113
[data_ty],
21142114
[
21152115
LLVMMatchType<0>, // llvm value to reduce (SGPR/VGPR)
@@ -2119,8 +2119,14 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
21192119
],
21202120
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;
21212121

2122-
def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
2123-
def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
2122+
multiclass AMDGPUWaveReduceGenerator<list<string> Operations> {
2123+
foreach Op = Operations in {
2124+
def Op : AMDGPUWaveReduce;
2125+
}
2126+
}
2127+
2128+
defvar Operations = ["umin", "min", "umax", "max", "add", "sub", "and", "or", "xor"];
2129+
defm int_amdgcn_wave_reduce_ : AMDGPUWaveReduceGenerator<Operations>;
21242130

21252131
def int_amdgcn_readfirstlane :
21262132
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4846,8 +4846,15 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
48464846
OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
48474847
break;
48484848
}
4849-
case Intrinsic::amdgcn_wave_reduce_umin:
4850-
case Intrinsic::amdgcn_wave_reduce_umax: {
4849+
case Intrinsic::amdgcn_wave_reduce_add:
4850+
case Intrinsic::amdgcn_wave_reduce_sub:
4851+
case Intrinsic::amdgcn_wave_reduce_min:
4852+
case Intrinsic::amdgcn_wave_reduce_umin:
4853+
case Intrinsic::amdgcn_wave_reduce_max:
4854+
case Intrinsic::amdgcn_wave_reduce_umax:
4855+
case Intrinsic::amdgcn_wave_reduce_and:
4856+
case Intrinsic::amdgcn_wave_reduce_or:
4857+
case Intrinsic::amdgcn_wave_reduce_xor: {
48514858
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
48524859
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
48534860
unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 115 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4861,10 +4861,80 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
48614861
Register DstReg = MI.getOperand(0).getReg();
48624862
MachineBasicBlock *RetBB = nullptr;
48634863
if (isSGPR) {
4864-
// These operations with a uniform value i.e. SGPR are idempotent.
4865-
// Reduced value will be same as given sgpr.
4866-
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
4867-
RetBB = &BB;
4864+
switch(Opc){
4865+
case AMDGPU::S_MIN_U32:
4866+
case AMDGPU::S_MIN_I32:
4867+
case AMDGPU::S_MAX_U32:
4868+
case AMDGPU::S_MAX_I32:
4869+
case AMDGPU::S_AND_B32:
4870+
case AMDGPU::S_OR_B32:{
4871+
// These operations with a uniform value i.e. SGPR are idempotent.
4872+
// Reduced value will be same as given sgpr.
4873+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
4874+
RetBB = &BB;
4875+
break;
4876+
}
4877+
case AMDGPU::S_XOR_B32:
4878+
case AMDGPU::S_ADD_I32:
4879+
case AMDGPU::S_SUB_I32:{
4880+
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
4881+
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
4882+
Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
4883+
Register CountOfActiveLanesReg = MRI.createVirtualRegister(DstRegClass);
4884+
4885+
bool IsWave32 = ST.isWave32();
4886+
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4887+
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4888+
unsigned CountReg = IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
4889+
4890+
// Create initial values of induction variable from Exec, Accumulator and
4891+
// insert branch instr to newly created ComputeBlock
4892+
auto Exec =
4893+
BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
4894+
4895+
auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), CountOfActiveLanesReg)
4896+
.addReg(Exec->getOperand(0).getReg());
4897+
4898+
switch(Opc){
4899+
case AMDGPU::S_XOR_B32:{
4900+
// Performing an XOR operation on a uniform value
4901+
// depends on the number of active lanes. If there
4902+
// are an even number of active lanes, then the XOR
4903+
// will result in 0. And if there are an odd number
4904+
// of Active lanes then the XOR will result in the
4905+
// same value as that in the SGPR. This comes from
4906+
// the fact that A^A = 0 and A^0 = A.
4907+
4908+
Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
4909+
4910+
auto ParityReg = BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
4911+
.addReg(NewAccumulator->getOperand(0).getReg())
4912+
.addImm(1);
4913+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
4914+
.addReg(SrcReg)
4915+
.addReg(ParityReg->getOperand(0).getReg()) ;
4916+
break;
4917+
}
4918+
case AMDGPU::S_SUB_I32:{
4919+
Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
4920+
4921+
// Take the negation of the source operand.
4922+
auto InvertedValReg = BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal).addImm(-1).addReg(SrcReg);
4923+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
4924+
.addReg(InvertedValReg->getOperand(0).getReg())
4925+
.addReg(NewAccumulator->getOperand(0).getReg());
4926+
break;
4927+
}
4928+
case AMDGPU::S_ADD_I32:{
4929+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
4930+
.addReg(SrcReg)
4931+
.addReg(NewAccumulator->getOperand(0).getReg());
4932+
break;
4933+
}
4934+
}
4935+
RetBB = &BB;
4936+
}
4937+
}
48684938
} else {
48694939
// TODO: Implement DPP Strategy and switch based on immediate strategy
48704940
// operand. For now, for all the cases (default, Iterative and DPP we use
@@ -4900,9 +4970,30 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
49004970
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
49014971

49024972
// Create initial values of induction variable from Exec, Accumulator and
4903-
// insert branch instr to newly created ComputeBlockk
4904-
uint32_t InitalValue =
4905-
(Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
4973+
// insert branch instr to newly created ComputeBlock
4974+
uint32_t InitalValue;
4975+
switch(Opc){
4976+
case AMDGPU::S_MIN_U32:
4977+
InitalValue = std::numeric_limits<uint32_t>::max();
4978+
break;
4979+
case AMDGPU::S_MIN_I32:
4980+
InitalValue = std::numeric_limits<int32_t>::max();
4981+
break;
4982+
case AMDGPU::S_MAX_U32:
4983+
InitalValue = 0;
4984+
break;
4985+
case AMDGPU::S_MAX_I32:
4986+
InitalValue = std::numeric_limits<int32_t>::min();
4987+
break;
4988+
case AMDGPU::S_ADD_I32:
4989+
case AMDGPU::S_SUB_I32:
4990+
case AMDGPU::S_OR_B32:
4991+
case AMDGPU::S_XOR_B32:
4992+
InitalValue = 0x00000000;
4993+
break;
4994+
case AMDGPU::S_AND_B32:
4995+
InitalValue = 0xFFFFFFFF;
4996+
}
49064997
auto TmpSReg =
49074998
BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
49084999
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
@@ -4968,10 +5059,24 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
49685059
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
49695060

49705061
switch (MI.getOpcode()) {
4971-
case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5062+
case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_U32:
49725063
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
4973-
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5064+
case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5065+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
5066+
case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_U32:
49745067
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5068+
case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
5069+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
5070+
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
5071+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5072+
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
5073+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5074+
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
5075+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
5076+
case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
5077+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
5078+
case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
5079+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
49755080
case AMDGPU::S_UADDO_PSEUDO:
49765081
case AMDGPU::S_USUBO_PSEUDO: {
49775082
const DebugLoc &DL = MI.getDebugLoc();
@@ -6859,7 +6964,7 @@ SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
68596964

68606965
SDValue Op0 = Op.getOperand(0);
68616966
SDValue Op1 = Op.getOperand(1);
6862-
// If all the operands are zero-enteted to 32-bits, then we replace s_mul_u64
6967+
// If all the operands are zero-extended to 32-bits, then we replace s_mul_u64
68636968
// with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
68646969
// 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
68656970
KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -255,15 +255,50 @@ def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
255255
(V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;
256256

257257
let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
258-
def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
258+
def WAVE_REDUCE_MIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
259259
(ins VSrc_b32: $src, VSrc_b32:$strategy),
260260
[(set i32:$sdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$strategy))]> {
261261
}
262262

263-
def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
263+
def WAVE_REDUCE_MIN_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
264+
(ins VSrc_b32: $src, VSrc_b32:$strategy),
265+
[(set i32:$sdst, (int_amdgcn_wave_reduce_min i32:$src, i32:$strategy))]> {
266+
}
267+
268+
def WAVE_REDUCE_MAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
264269
(ins VSrc_b32: $src, VSrc_b32:$strategy),
265270
[(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> {
266271
}
272+
273+
def WAVE_REDUCE_MAX_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
274+
(ins VSrc_b32: $src, VSrc_b32:$strategy),
275+
[(set i32:$sdst, (int_amdgcn_wave_reduce_max i32:$src, i32:$strategy))]> {
276+
}
277+
278+
def WAVE_REDUCE_ADD_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
279+
(ins VSrc_b32: $src, VSrc_b32:$strategy),
280+
[(set i32:$sdst, (int_amdgcn_wave_reduce_add i32:$src, i32:$strategy))]> {
281+
}
282+
283+
def WAVE_REDUCE_SUB_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
284+
(ins VSrc_b32: $src, VSrc_b32:$strategy),
285+
[(set i32:$sdst, (int_amdgcn_wave_reduce_sub i32:$src, i32:$strategy))]> {
286+
}
287+
288+
def WAVE_REDUCE_AND_PSEUDO_B32 : VPseudoInstSI <(outs SGPR_32:$sdst),
289+
(ins VSrc_b32: $src, VSrc_b32:$strategy),
290+
[(set i32:$sdst, (int_amdgcn_wave_reduce_and i32:$src, i32:$strategy))]> {
291+
}
292+
293+
def WAVE_REDUCE_OR_PSEUDO_B32 : VPseudoInstSI <(outs SGPR_32:$sdst),
294+
(ins VSrc_b32: $src, VSrc_b32:$strategy),
295+
[(set i32:$sdst, (int_amdgcn_wave_reduce_or i32:$src, i32:$strategy))]> {
296+
}
297+
298+
def WAVE_REDUCE_XOR_PSEUDO_B32 : VPseudoInstSI <(outs SGPR_32:$sdst),
299+
(ins VSrc_b32: $src, VSrc_b32:$strategy),
300+
[(set i32:$sdst, (int_amdgcn_wave_reduce_xor i32:$src, i32:$strategy))]> {
301+
}
267302
}
268303

269304
let usesCustomInserter = 1, Defs = [VCC] in {

llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_value(ptr addrspace(1) %ptr) #
132132
; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP6]], 0
133133
; IR-ITERATIVE-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP14:%.*]]
134134
; IR-ITERATIVE: 12:
135-
; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP10]] seq_cst, align 4
135+
; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP10]] seq_cst, align 4
136136
; IR-ITERATIVE-NEXT: br label [[TMP14]]
137137
; IR-ITERATIVE: 14:
138138
; IR-ITERATIVE-NEXT: ret void
@@ -151,12 +151,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_value(ptr addrspace(1) %ptr) #
151151
; IR-DPP-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP6]], 0
152152
; IR-DPP-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP14:%.*]]
153153
; IR-DPP: 12:
154-
; IR-DPP-NEXT: [[TMP13:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP10]] seq_cst, align 4
154+
; IR-DPP-NEXT: [[TMP13:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP10]] seq_cst, align 4
155155
; IR-DPP-NEXT: br label [[TMP14]]
156156
; IR-DPP: 14:
157157
; IR-DPP-NEXT: ret void
158158
;
159-
%result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 seq_cst
159+
%result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 seq_cst
160160
ret void
161161
}
162162

0 commit comments

Comments
 (0)