Skip to content

Commit c5c1cc5

Browse files
committed
reduce over divergent wave
1 parent 78408fd commit c5c1cc5

File tree

5 files changed

+62
-27
lines changed

5 files changed

+62
-27
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2313,15 +2313,16 @@ def int_amdgcn_s_quadmask :
23132313
def int_amdgcn_s_wqm :
23142314
DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem, IntrConvergent]>;
23152315

2316-
class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
2316+
class AMDGPUWaveReduce<LLVMType data_ty = llvm_i32_ty> : Intrinsic<
23172317
[data_ty],
23182318
[
2319-
LLVMMatchType<0>, // llvm value to reduce (SGPR/VGPR)
2319+
llvm_i32_ty, // llvm value to reduce (SGPR/VGPR)
2320+
llvm_i64_ty, // Divergent mask
23202321
llvm_i32_ty // Reduction Strategy Switch for lowering ( 0: Default,
23212322
// 1: Iterative strategy, and
23222323
// 2: DPP)
23232324
],
2324-
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;
2325+
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<2>>]>;
23252326

23262327
def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
23272328
def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 32 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4970,11 +4970,21 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
49704970
const SIRegisterInfo *TRI = ST.getRegisterInfo();
49714971
const DebugLoc &DL = MI.getDebugLoc();
49724972
const SIInstrInfo *TII = ST.getInstrInfo();
4973-
4973+
const MachineFunction *MF = BB.getParent();
4974+
const TargetRegisterInfo *TrgtRegInfo = MF->getSubtarget().getRegisterInfo();
49744975
// Reduction operations depend on whether the input operand is SGPR or VGPR.
49754976
Register SrcReg = MI.getOperand(1).getReg();
4976-
bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
4977+
auto SrcRegClass = MRI.getRegClass(SrcReg);
4978+
llvm::errs() << TrgtRegInfo->getRegClassName(SrcRegClass) << "\n";
4979+
bool isSGPR = TRI->isSGPRClass(SrcRegClass);
49774980
Register DstReg = MI.getOperand(0).getReg();
4981+
llvm::errs() << TrgtRegInfo->getRegClassName(MRI.getRegClass(DstReg)) << "\n";
4982+
Register MaskReg = MI.getOperand(2).getReg();
4983+
llvm::errs() << TrgtRegInfo->getRegClassName(MRI.getRegClass(MaskReg)) << "\n";
4984+
4985+
// llvm::errs() << "srcreg:" << MRI.getRegClassName(MRI.getRegClass(SrcReg)) << "\n";
4986+
// llvm::errs() << "DstReg:" << MRI.getRegClassName(MRI.getRegClass(DstReg)) << "\n";
4987+
// llvm::errs() << "MaskReg:" << MRI.getRegClassName(MRI.getRegClass(MaskReg)) << "\n";
49784988
MachineBasicBlock *RetBB = nullptr;
49794989
if (isSGPR) {
49804990
// These operations with a uniform value i.e. SGPR are idempotent.
@@ -5005,15 +5015,19 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
50055015
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
50065016
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
50075017
Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5008-
Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
5018+
Register InitalValReg = MRI.createVirtualRegister(DstRegClass);//MRI.getRegClass(SrcReg)
50095019

5010-
Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
5020+
Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);//MRI.getRegClass(SrcReg)
50115021
Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
50125022
Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5023+
Register TempRegMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
50135024

50145025
Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
5026+
Register FF1MaskReg = MRI.createVirtualRegister(DstRegClass);
50155027
Register LaneValueReg =
50165028
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5029+
Register MaskLaneValueReg =
5030+
MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
50175031

50185032
bool IsWave32 = ST.isWave32();
50195033
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
@@ -5024,9 +5038,11 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
50245038
uint32_t InitalValue =
50255039
(Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
50265040
auto TmpSReg =
5027-
BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
5041+
BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg); //s_mov_b64 s[2:3], exec
5042+
// auto TmpMaskSReg =
5043+
// BuildMI(BB, I, DL, TII->get(MovOpc), TempRegMaskReg).addReg(MaskReg); // copies the mask operand (not exec) into TempRegMaskReg
50285044
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
5029-
.addImm(InitalValue);
5045+
.addImm(InitalValue);//s_mov_b32 s4, 0 | %17:sgpr_32 = S_MOV_B32 0
50305046
// clang-format off
50315047
BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
50325048
.addMBB(ComputeLoop);
@@ -5046,22 +5062,28 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
50465062
// Perform the computations
50475063
unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
50485064
auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5049-
.addReg(ActiveBits->getOperand(0).getReg());
5065+
.addReg(ActiveBits->getOperand(0).getReg());//%index.sgpr = S_FF1_I32_B64 %exec_copy.sreg
50505066
auto LaneValue = BuildMI(*ComputeLoop, I, DL,
50515067
TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
50525068
.addReg(SrcReg)
5053-
.addReg(FF1->getOperand(0).getReg());
5069+
.addReg(FF1->getOperand(0).getReg());//%value_at_lane_index.sreg = V_READLANE %value.vgpr %index.sgpr
5070+
auto MaskLaneValue = BuildMI(*ComputeLoop, I, DL,
5071+
TII->get(AMDGPU::V_READLANE_B32), MaskLaneValueReg)
5072+
.addReg(MaskReg)
5073+
.addReg(FF1->getOperand(0).getReg());//%mask_at_lane_index.sreg = V_READLANE %mask.vgpr %index.sgpr
5074+
auto FF2 = BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_FF1_I32_B64), FF1Reg)
5075+
.addReg(MaskLaneValue->getOperand(0).getReg());//%subgroupindex.sgpr = S_FF1_I32_B64 %mask_at_lane_index.sreg
50545076
auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
50555077
.addReg(Accumulator->getOperand(0).getReg())
5056-
.addReg(LaneValue->getOperand(0).getReg());
5078+
.addReg(LaneValue->getOperand(0).getReg());//%acc.sgpr = max %acc.sgpr %value_at_lane_index.sreg
50575079

50585080
// Manipulate the iterator to get the next active lane
50595081
unsigned BITSETOpc =
50605082
IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
50615083
auto NewActiveBits =
50625084
BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
50635085
.addReg(FF1->getOperand(0).getReg())
5064-
.addReg(ActiveBits->getOperand(0).getReg());
5086+
.addReg(ActiveBits->getOperand(0).getReg());//%bitsetresult = S_BITSET0_B64 %exec_copy
50655087

50665088
// Add phi nodes
50675089
Accumulator.addReg(NewAccumulator->getOperand(0).getReg())

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -298,14 +298,14 @@ def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
298298
(V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;
299299

300300
let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
301-
def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
302-
(ins VSrc_b32: $src, VSrc_b32:$strategy),
303-
[(set i32:$sdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$strategy))]> {
301+
def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs VGPR_32:$sdst),
302+
(ins VSrc_b32: $src, VSrc_b64: $mask, VSrc_b32:$strategy),
303+
[(i32 (int_amdgcn_wave_reduce_umin i32:$src, i64:$mask, i32:$strategy))]> {
304304
}
305305

306-
def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
307-
(ins VSrc_b32: $src, VSrc_b32:$strategy),
308-
[(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> {
306+
def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs VGPR_32:$sdst),
307+
(ins VSrc_b32: $src, VSrc_b64: $mask, VSrc_b32:$strategy),
308+
[(i32 (int_amdgcn_wave_reduce_umax i32:$src, i64:$mask, i32:$strategy))]> {
309309
}
310310
}
311311

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
1313
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
1414

15-
declare i32 @llvm.amdgcn.wave.reduce.umax.i32(i32, i32 immarg)
15+
declare i32 @llvm.amdgcn.wave.reduce.umax.i32(i32, i32, i32 immarg)
1616
declare i32 @llvm.amdgcn.workitem.id.x()
1717

1818
define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
@@ -122,12 +122,12 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
122122
; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
123123
; GFX1132GISEL-NEXT: s_endpgm
124124
entry:
125-
%result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %in, i32 1)
125+
%result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %in, i32 15, i32 1)
126126
store i32 %result, ptr addrspace(1) %out
127127
ret void
128128
}
129129

130-
define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
130+
define amdgpu_kernel void @const_value(ptr addrspace(1) %out, i32 %in) {
131131
; GFX8DAGISEL-LABEL: const_value:
132132
; GFX8DAGISEL: ; %bb.0: ; %entry
133133
; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -218,7 +218,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
218218
; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
219219
; GFX1132GISEL-NEXT: s_endpgm
220220
entry:
221-
%result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 123, i32 1)
221+
%result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 123, i32 %in, i32 1)
222222
store i32 %result, ptr addrspace(1) %out
223223
ret void
224224
}
@@ -256,7 +256,7 @@ define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) {
256256
; GFX11GISEL: ; %bb.0: ; %entry
257257
; GFX11GISEL-NEXT: s_endpgm
258258
entry:
259-
%result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 poison, i32 1)
259+
%result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 poison, i32 poison, i32 1)
260260
store i32 %result, ptr addrspace(1) %out
261261
ret void
262262
}
@@ -499,7 +499,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) {
499499
; GFX1132GISEL-NEXT: s_endpgm
500500
entry:
501501
%id.x = call i32 @llvm.amdgcn.workitem.id.x()
502-
%result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %id.x, i32 1)
502+
%result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %id.x, i32 %in, i32 1)
503503
store i32 %result, ptr addrspace(1) %out
504504
ret void
505505
}
@@ -937,11 +937,11 @@ entry:
937937
br i1 %d_cmp, label %if, label %else
938938

939939
if:
940-
%reducedValTid = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %tid, i32 1)
940+
%reducedValTid = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %tid, i32 %in, i32 1)
941941
br label %endif
942942

943943
else:
944-
%reducedValIn = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %in, i32 1)
944+
%reducedValIn = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %in, i32 %in, i32 1)
945945
br label %endif
946946

947947
endif:

newreduceumax.ll

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, ptr addrspace(1) %maskarr, i32 %in) {
2+
3+
entry:
4+
%id.x = call i32 @llvm.amdgcn.workitem.id.x()
5+
%mask_ptr = getelementptr inbounds i32, ptr addrspace(1) %maskarr, i32 %id.x
6+
; %mask_ptr_casted = bitcast ptr addrspace(1) %mask_ptr to ptr
7+
%mask = load i32, ptr addrspace(1) %mask_ptr
8+
%result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %id.x, i32 %mask, i32 1)
9+
store i32 %result, ptr addrspace(1) %out
10+
ret void
11+
12+
}

0 commit comments

Comments
 (0)