Skip to content

Commit c8989dc

Browse files
committed
reduce with respect to the divergent mask
1 parent 78408fd commit c8989dc

File tree

6 files changed

+141
-84
lines changed

6 files changed

+141
-84
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,8 @@ BUILTIN(__builtin_r600_read_tidig_z, "Ui", "nc")
363363
BUILTIN(__builtin_r600_recipsqrt_ieee, "dd", "nc")
364364
BUILTIN(__builtin_r600_recipsqrt_ieeef, "ff", "nc")
365365

366+
BUILTIN(__builtin_amdgcn_wave_reduce_mask_max_i32, "iiii", "nc")
367+
366368
//===----------------------------------------------------------------------===//
367369
// MFMA builtins.
368370
//===----------------------------------------------------------------------===//
@@ -620,5 +622,6 @@ TARGET_BUILTIN(__builtin_amdgcn_bitop3_b16, "ssssIUi", "nc", "bitop3-insts")
620622
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf16_f32, "V2yV2yfUiIb", "nc", "f32-to-f16bf16-cvt-sr-insts")
621623
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_f16_f32, "V2hV2hfUiIb", "nc", "f32-to-f16bf16-cvt-sr-insts")
622624

625+
623626
#undef BUILTIN
624627
#undef TARGET_BUILTIN

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20053,6 +20053,15 @@ void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst,
2005320053
Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs));
2005420054
}
2005520055

20056+
// Map a clang AMDGPU wave-reduction builtin ID to the corresponding LLVM
// intrinsic. Aborts on any builtin that is not a wave-reduction builtin.
static Intrinsic::ID getIntrinsicIDforWaveReduction(unsigned BuiltinID) {
  if (BuiltinID == clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_mask_max_i32)
    return Intrinsic::amdgcn_wave_reduce_umax;
  llvm_unreachable("Unknown BuiltinID for wave reduction");
}
20064+
2005620065
Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
2005720066
const CallExpr *E) {
2005820067
llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
@@ -20360,6 +20369,15 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
2036020369
return EmitAMDGCNBallotForExec(*this, E, Int32Ty, Int32Ty, false);
2036120370
case AMDGPU::BI__builtin_amdgcn_read_exec_hi:
2036220371
return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, true);
20372+
case AMDGPU::BI__builtin_amdgcn_wave_reduce_mask_max_i32: {
  // Lower the wave-reduce builtin to its LLVM intrinsic, overloaded on the
  // type of the value being reduced. Operands:
  //   arg0 - value to reduce (may be divergent)
  //   arg1 - divergent mask selecting the lanes that participate
  //   arg2 - reduction strategy switch (0: default, 1: iterative, 2: DPP);
  //          the intrinsic marks this ImmArg, so it must fold to a constant.
  Intrinsic::ID IID = getIntrinsicIDforWaveReduction(BuiltinID);
  llvm::Value *Value = EmitScalarExpr(E->getArg(0));
  llvm::Value *Mask = EmitScalarExpr(E->getArg(1));
  llvm::Value *Strategy = EmitScalarExpr(E->getArg(2));
  llvm::Function *F = CGM.getIntrinsic(IID, {Value->getType()});
  return Builder.CreateCall(F, {Value, Mask, Strategy});
}
2036320381
case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray:
2036420382
case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_h:
2036520383
case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_l:

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2316,12 +2316,13 @@ def int_amdgcn_s_wqm :
23162316
class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
23172317
[data_ty],
23182318
[
2319-
LLVMMatchType<0>, // llvm value to reduce (SGPR/VGPR)
2319+
LLVMMatchType<0>, // llvm value to reduce (SGPR/VGPR),
2320+
llvm_i32_ty, // Divergent mask
23202321
llvm_i32_ty // Reduction Strategy Switch for lowering ( 0: Default,
23212322
// 1: Iterative strategy, and
23222323
// 2. DPP)
23232324
],
2324-
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;
2325+
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<2>>]>;
23252326

23262327
def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
23272328
def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 103 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -4970,114 +4970,149 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
49704970
const SIRegisterInfo *TRI = ST.getRegisterInfo();
49714971
const DebugLoc &DL = MI.getDebugLoc();
49724972
const SIInstrInfo *TII = ST.getInstrInfo();
4973-
4973+
// const MachineFunction *MF = BB.getParent();
4974+
// const TargetRegisterInfo *TrgtRegInfo = MF->getSubtarget().getRegisterInfo();
49744975
// Reduction operations depend on whether the input operand is SGPR or VGPR.
49754976
Register SrcReg = MI.getOperand(1).getReg();
4976-
bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
4977+
auto SrcRegClass = MRI.getRegClass(SrcReg);
4978+
// llvm::errs() << TrgtRegInfo->getRegClassName(SrcRegClass) << "\n";
4979+
bool isSGPR = TRI->isSGPRClass(SrcRegClass);
49774980
Register DstReg = MI.getOperand(0).getReg();
4981+
// llvm::errs() << TrgtRegInfo->getRegClassName(MRI.getRegClass(DstReg)) << "\n";
4982+
Register DivergentMaskReg = MI.getOperand(2).getReg();
4983+
// llvm::errs() << TrgtRegInfo->getRegClassName(MRI.getRegClass(DivergentMaskReg)) << "\n";
4984+
49784985
MachineBasicBlock *RetBB = nullptr;
49794986
if (isSGPR) {
4980-
// These operations with a uniform value i.e. SGPR are idempotent.
4981-
// Reduced value will be same as given sgpr.
4982-
// clang-format off
49834987
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
49844988
.addReg(SrcReg);
4985-
// clang-format on
49864989
RetBB = &BB;
49874990
} else {
4988-
// TODO: Implement DPP Strategy and switch based on immediate strategy
4989-
// operand. For now, for all the cases (default, Iterative and DPP we use
4990-
// iterative approach by default.)
4991-
4992-
// To reduce the VGPR using iterative approach, we need to iterate
4993-
// over all the active lanes. Lowering consists of ComputeLoop,
4994-
// which iterate over only active lanes. We use copy of EXEC register
4995-
// as induction variable and every active lane modifies it using bitset0
4996-
// so that we will get the next active lane for next iteration.
4991+
49974992
MachineBasicBlock::iterator I = BB.end();
4998-
Register SrcReg = MI.getOperand(1).getReg();
49994993

5000-
// Create Control flow for loop
5001-
// Split MI's Machine Basic block into For loop
50024994
auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
50034995

5004-
// Create virtual registers required for lowering.
4996+
auto SReg32XM0RegClass = &AMDGPU::SReg_32_XM0RegClass;
4997+
auto SReg32RegClass = &AMDGPU::SReg_32RegClass;
4998+
50054999
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
50065000
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5007-
Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5008-
Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
5009-
5010-
Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
5011-
Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5012-
Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5013-
5014-
Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
5015-
Register LaneValueReg =
5016-
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5001+
Register ExecCopyReg = MRI.createVirtualRegister(WaveMaskRegClass);
5002+
Register ExecCopyReg1 = MRI.createVirtualRegister(WaveMaskRegClass);
5003+
Register AccSGPRReg = MRI.createVirtualRegister(SReg32XM0RegClass);
5004+
Register UpdatedAccSGPRReg = MRI.createVirtualRegister(SReg32RegClass);
5005+
Register AccReg1 = MRI.createVirtualRegister(DstRegClass);
5006+
Register AccReg = MRI.createVirtualRegister(DstRegClass);
5007+
Register BPermAddrReg = MRI.createVirtualRegister(DstRegClass);
5008+
Register UpdatedBPermAddrReg = MRI.createVirtualRegister(DstRegClass);
5009+
Register InitialBPermAddrReg = MRI.createVirtualRegister(DstRegClass);
5010+
Register UpdatedAccReg = MRI.createVirtualRegister(DstRegClass);
5011+
Register ActiveLanesReg = MRI.createVirtualRegister(WaveMaskRegClass);
5012+
Register UpdatedActiveLanesReg = MRI.createVirtualRegister(WaveMaskRegClass);
5013+
Register FF1ActiveLanesReg = MRI.createVirtualRegister(SReg32RegClass);
5014+
Register FF1MaskReg = MRI.createVirtualRegister(SReg32RegClass);
5015+
Register FF1MaskX4Reg = MRI.createVirtualRegister(SReg32RegClass);
5016+
Register ValReg = MRI.createVirtualRegister(SReg32XM0RegClass);
5017+
Register MaskReg = MRI.createVirtualRegister(SReg32XM0RegClass);
50175018

50185019
bool IsWave32 = ST.isWave32();
5019-
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5020-
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
50215020

5022-
// Create initail values of induction variable from Exec, Accumulator and
5023-
// insert branch instr to newly created ComputeBlockk
5024-
uint32_t InitalValue =
5021+
uint32_t IdentityValue =
50255022
(Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
5026-
auto TmpSReg =
5027-
BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
5028-
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
5029-
.addImm(InitalValue);
5030-
// clang-format off
5023+
5024+
BuildMI(BB, I, DL, TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), ExecCopyReg).addReg(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC); //%19:sreg_64_xexec = S_MOV_B64 $exec
5025+
5026+
BuildMI(BB, I, DL, TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), ExecCopyReg1).addReg(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC); //%19:sreg_64_xexec = S_MOV_B64 $exec
5027+
5028+
BuildMI(BB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), AccReg)
5029+
.addImm(IdentityValue);// %24:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
5030+
BuildMI(BB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), InitialBPermAddrReg)
5031+
.addImm(0);
50315032
BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
50325033
.addMBB(ComputeLoop);
5033-
// clang-format on
50345034

5035-
// Start constructing ComputeLoop
50365035
I = ComputeLoop->end();
5037-
auto Accumulator =
5038-
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5039-
.addReg(InitalValReg)
5040-
.addMBB(&BB);
5041-
auto ActiveBits =
5042-
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5043-
.addReg(TmpSReg->getOperand(0).getReg())
5044-
.addMBB(&BB);
50455036

5037+
auto PhiActiveLanesInst =
5038+
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveLanesReg)
5039+
.addReg(ExecCopyReg)
5040+
.addMBB(&BB);// %25:sreg_64_xexec = PHI %19:sreg_64_xexec, %bb.0, %26:sreg_64_xexec, %bb.1
5041+
auto PhiAccInst =
5042+
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccReg1)
5043+
.addReg(AccReg)
5044+
.addMBB(&BB);//%23:vgpr_32 = PHI %24:vgpr_32, %bb.0, %22:vgpr_32, %bb.1
5045+
auto PhiBPermAddrInst =
5046+
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), BPermAddrReg)
5047+
.addReg(InitialBPermAddrReg)
5048+
.addMBB(&BB);//%23:vgpr_32 = PHI %24:vgpr_32, %bb.0, %22:vgpr_32, %bb.1
50465049
// Perform the computations
5047-
unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5048-
auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5049-
.addReg(ActiveBits->getOperand(0).getReg());
5050-
auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5051-
TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
5050+
BuildMI(*ComputeLoop, I, DL, TII->get(IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64), FF1ActiveLanesReg)
5051+
.addReg(ActiveLanesReg);//%27:sreg_32 = S_FF1_I32_B64 %25:sreg_64_xexec
5052+
5053+
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), ValReg)
50525054
.addReg(SrcReg)
5053-
.addReg(FF1->getOperand(0).getReg());
5054-
auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5055-
.addReg(Accumulator->getOperand(0).getReg())
5056-
.addReg(LaneValue->getOperand(0).getReg());
5055+
.addReg(FF1ActiveLanesReg);//%29:sreg_32_xm0 = V_READLANE_B32 %10:vgpr_32, %27:sreg_32
5056+
5057+
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), MaskReg)
5058+
.addReg(DivergentMaskReg)
5059+
.addReg(FF1ActiveLanesReg);
5060+
5061+
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_FF1_I32_B32), FF1MaskReg).addReg(MaskReg);
5062+
5063+
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), AccSGPRReg)
5064+
.addReg(AccReg1)
5065+
.addReg(FF1MaskReg);
5066+
5067+
BuildMI(*ComputeLoop, I, DL, TII->get(Opc), UpdatedAccSGPRReg).addReg(AccSGPRReg).addReg(ValReg);
5068+
5069+
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
5070+
.addReg(FF1MaskReg);
5071+
5072+
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_WRITELANE_B32), UpdatedAccReg)
5073+
.addReg(UpdatedAccSGPRReg)
5074+
.addReg(AMDGPU::M0)
5075+
.addReg(AccReg1);
5076+
5077+
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_LSHL_B32), FF1MaskX4Reg)
5078+
.addReg(FF1MaskReg)
5079+
.addImm(2);
5080+
5081+
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
5082+
.addReg(FF1ActiveLanesReg);
5083+
5084+
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_WRITELANE_B32), UpdatedBPermAddrReg)
5085+
.addReg(FF1MaskX4Reg)
5086+
.addReg(AMDGPU::M0)
5087+
.addReg(BPermAddrReg);
50575088

5058-
// Manipulate the iterator to get the next active lane
50595089
unsigned BITSETOpc =
50605090
IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5061-
auto NewActiveBits =
5062-
BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5063-
.addReg(FF1->getOperand(0).getReg())
5064-
.addReg(ActiveBits->getOperand(0).getReg());
5091+
BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), UpdatedActiveLanesReg)
5092+
.addReg(FF1ActiveLanesReg)
5093+
.addReg(ActiveLanesReg);
50655094

5066-
// Add phi nodes
5067-
Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
5095+
PhiActiveLanesInst.addReg(UpdatedActiveLanesReg)
50685096
.addMBB(ComputeLoop);
5069-
ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
5097+
PhiAccInst.addReg(UpdatedAccReg)
5098+
.addMBB(ComputeLoop);
5099+
PhiBPermAddrInst.addReg(UpdatedBPermAddrReg)
50705100
.addMBB(ComputeLoop);
50715101

5072-
// Creating branching
50735102
unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
50745103
BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5075-
.addReg(NewActiveBits->getOperand(0).getReg())
5104+
.addReg(UpdatedActiveLanesReg)
50765105
.addImm(0);
50775106
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
50785107
.addMBB(ComputeLoop);
50795108

5109+
BuildMI(*ComputeEnd, ComputeEnd->begin(), DL, TII->get(AMDGPU::DS_BPERMUTE_B32), DstReg)
5110+
.addReg(UpdatedBPermAddrReg)
5111+
.addReg(UpdatedAccReg)
5112+
.addImm(0);
5113+
50805114
RetBB = ComputeEnd;
5115+
50815116
}
50825117
MI.eraseFromParent();
50835118
return RetBB;

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -298,14 +298,14 @@ def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
298298
(V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;
299299

300300
let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
301-
def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
302-
(ins VSrc_b32: $src, VSrc_b32:$strategy),
303-
[(set i32:$sdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$strategy))]> {
301+
def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs VGPR_32:$vdst),
302+
(ins VSrc_b32: $src, VSrc_b32: $mask, VSrc_b32:$strategy),
303+
[(set i32:$vdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$mask, i32:$strategy))]> {
304304
}
305305

306-
def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
307-
(ins VSrc_b32: $src, VSrc_b32:$strategy),
308-
[(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> {
306+
def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs VGPR_32:$vdst),
307+
(ins VSrc_b32: $src, VSrc_b32: $mask, VSrc_b32:$strategy),
308+
[(set i32:$vdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$mask, i32:$strategy))]> {
309309
}
310310
}
311311

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
1313
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
1414

15-
declare i32 @llvm.amdgcn.wave.reduce.umax.i32(i32, i32 immarg)
15+
declare i32 @llvm.amdgcn.wave.reduce.umax.i32(i32, i32, i32 immarg)
1616
declare i32 @llvm.amdgcn.workitem.id.x()
1717

1818
define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
@@ -122,12 +122,12 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
122122
; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
123123
; GFX1132GISEL-NEXT: s_endpgm
124124
entry:
125-
%result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %in, i32 1)
125+
%result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %in, i32 15, i32 1)
126126
store i32 %result, ptr addrspace(1) %out
127127
ret void
128128
}
129129

130-
define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
130+
define amdgpu_kernel void @const_value(ptr addrspace(1) %out, i32 %in) {
131131
; GFX8DAGISEL-LABEL: const_value:
132132
; GFX8DAGISEL: ; %bb.0: ; %entry
133133
; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -218,7 +218,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
218218
; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
219219
; GFX1132GISEL-NEXT: s_endpgm
220220
entry:
221-
%result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 123, i32 1)
221+
%result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 123, i32 %in, i32 1)
222222
store i32 %result, ptr addrspace(1) %out
223223
ret void
224224
}
@@ -256,7 +256,7 @@ define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) {
256256
; GFX11GISEL: ; %bb.0: ; %entry
257257
; GFX11GISEL-NEXT: s_endpgm
258258
entry:
259-
%result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 poison, i32 1)
259+
%result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 poison, i32 poison, i32 1)
260260
store i32 %result, ptr addrspace(1) %out
261261
ret void
262262
}
@@ -499,7 +499,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) {
499499
; GFX1132GISEL-NEXT: s_endpgm
500500
entry:
501501
%id.x = call i32 @llvm.amdgcn.workitem.id.x()
502-
%result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %id.x, i32 1)
502+
%result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %id.x, i32 %in, i32 1)
503503
store i32 %result, ptr addrspace(1) %out
504504
ret void
505505
}
@@ -937,11 +937,11 @@ entry:
937937
br i1 %d_cmp, label %if, label %else
938938

939939
if:
940-
%reducedValTid = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %tid, i32 1)
940+
%reducedValTid = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %tid, i32 %in, i32 1)
941941
br label %endif
942942

943943
else:
944-
%reducedValIn = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %in, i32 1)
944+
%reducedValIn = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %in, i32 %in, i32 1)
945945
br label %endif
946946

947947
endif:

0 commit comments

Comments (0)