Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,8 @@ BUILTIN(__builtin_r600_read_tidig_z, "Ui", "nc")
BUILTIN(__builtin_r600_recipsqrt_ieee, "dd", "nc")
BUILTIN(__builtin_r600_recipsqrt_ieeef, "ff", "nc")

BUILTIN(__builtin_amdgcn_wave_reduce_mask_max_i32, "iiii", "nc")

//===----------------------------------------------------------------------===//
// MFMA builtins.
//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -620,5 +622,6 @@ TARGET_BUILTIN(__builtin_amdgcn_bitop3_b16, "ssssIUi", "nc", "bitop3-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf16_f32, "V2yV2yfUiIb", "nc", "f32-to-f16bf16-cvt-sr-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_f16_f32, "V2hV2hfUiIb", "nc", "f32-to-f16bf16-cvt-sr-insts")


#undef BUILTIN
#undef TARGET_BUILTIN
18 changes: 18 additions & 0 deletions clang/lib/CodeGen/CGBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20053,6 +20053,15 @@ void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst,
Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs));
}

// Map a wave-reduction builtin ID to the LLVM intrinsic that implements it.
// Aborts (llvm_unreachable) on any builtin that is not a wave reduction.
static Intrinsic::ID getIntrinsicIDforWaveReduction(unsigned BuiltinID) {
  if (BuiltinID == clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_mask_max_i32)
    return Intrinsic::amdgcn_wave_reduce_umax;
  llvm_unreachable("Unknown BuiltinID for wave reduction");
}

Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
const CallExpr *E) {
llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
Expand Down Expand Up @@ -20360,6 +20369,15 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
return EmitAMDGCNBallotForExec(*this, E, Int32Ty, Int32Ty, false);
case AMDGPU::BI__builtin_amdgcn_read_exec_hi:
return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, true);
case AMDGPU::BI__builtin_amdgcn_wave_reduce_mask_max_i32:{
Intrinsic::ID IID = getIntrinsicIDforWaveReduction(BuiltinID);
llvm::Value *Value = EmitScalarExpr(E->getArg(0));
llvm::Value *Mask = EmitScalarExpr(E->getArg(1));
llvm::Value *Strategy = EmitScalarExpr(E->getArg(2));
// llvm::errs() << "Value->getType():" << Value->getType() << "\n";
llvm::Function *F = CGM.getIntrinsic(IID, {Value->getType()});
return Builder.CreateCall(F, {Value, Mask, Strategy});
}
case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray:
case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_h:
case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_l:
Expand Down
5 changes: 3 additions & 2 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -2316,12 +2316,13 @@ def int_amdgcn_s_wqm :
class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
[data_ty],
[
LLVMMatchType<0>, // llvm value to reduce (SGPR/VGPR)
LLVMMatchType<0>, // llvm value to reduce (SGPR/VGPR),
llvm_i32_ty, // Divergent mask
       llvm_i32_ty        // Reduction Strategy Switch for lowering (0: Default,
                          //                                         1: Iterative strategy, and
                          //                                         2: DPP)
],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<2>>]>;

def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
Expand Down
171 changes: 103 additions & 68 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4970,114 +4970,149 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const DebugLoc &DL = MI.getDebugLoc();
const SIInstrInfo *TII = ST.getInstrInfo();

// const MachineFunction *MF = BB.getParent();
// const TargetRegisterInfo *TrgtRegInfo = MF->getSubtarget().getRegisterInfo();
// Reduction operations depend on whether the input operand is SGPR or VGPR.
Register SrcReg = MI.getOperand(1).getReg();
bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
auto SrcRegClass = MRI.getRegClass(SrcReg);
// llvm::errs() << TrgtRegInfo->getRegClassName(SrcRegClass) << "\n";
bool isSGPR = TRI->isSGPRClass(SrcRegClass);
Register DstReg = MI.getOperand(0).getReg();
// llvm::errs() << TrgtRegInfo->getRegClassName(MRI.getRegClass(DstReg)) << "\n";
Register DivergentMaskReg = MI.getOperand(2).getReg();
// llvm::errs() << TrgtRegInfo->getRegClassName(MRI.getRegClass(DivergentMaskReg)) << "\n";

MachineBasicBlock *RetBB = nullptr;
if (isSGPR) {
// These operations with a uniform value i.e. SGPR are idempotent.
// Reduced value will be same as given sgpr.
// clang-format off
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
.addReg(SrcReg);
// clang-format on
RetBB = &BB;
} else {
// TODO: Implement DPP Strategy and switch based on immediate strategy
// operand. For now, for all the cases (default, Iterative and DPP we use
// iterative approach by default.)

// To reduce the VGPR using iterative approach, we need to iterate
// over all the active lanes. Lowering consists of ComputeLoop,
// which iterate over only active lanes. We use copy of EXEC register
// as induction variable and every active lane modifies it using bitset0
// so that we will get the next active lane for next iteration.

MachineBasicBlock::iterator I = BB.end();
Register SrcReg = MI.getOperand(1).getReg();

// Create Control flow for loop
// Split MI's Machine Basic block into For loop
auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);

// Create virtual registers required for lowering.
auto SReg32XM0RegClass = &AMDGPU::SReg_32_XM0RegClass;
auto SReg32RegClass = &AMDGPU::SReg_32RegClass;

const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
Register InitalValReg = MRI.createVirtualRegister(DstRegClass);

Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);

Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
Register LaneValueReg =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
Register ExecCopyReg = MRI.createVirtualRegister(WaveMaskRegClass);
Register ExecCopyReg1 = MRI.createVirtualRegister(WaveMaskRegClass);
Register AccSGPRReg = MRI.createVirtualRegister(SReg32XM0RegClass);
Register UpdatedAccSGPRReg = MRI.createVirtualRegister(SReg32RegClass);
Register AccReg1 = MRI.createVirtualRegister(DstRegClass);
Register AccReg = MRI.createVirtualRegister(DstRegClass);
Register BPermAddrReg = MRI.createVirtualRegister(DstRegClass);
Register UpdatedBPermAddrReg = MRI.createVirtualRegister(DstRegClass);
Register InitialBPermAddrReg = MRI.createVirtualRegister(DstRegClass);
Register UpdatedAccReg = MRI.createVirtualRegister(DstRegClass);
Register ActiveLanesReg = MRI.createVirtualRegister(WaveMaskRegClass);
Register UpdatedActiveLanesReg = MRI.createVirtualRegister(WaveMaskRegClass);
Register FF1ActiveLanesReg = MRI.createVirtualRegister(SReg32RegClass);
Register FF1MaskReg = MRI.createVirtualRegister(SReg32RegClass);
Register FF1MaskX4Reg = MRI.createVirtualRegister(SReg32RegClass);
Register ValReg = MRI.createVirtualRegister(SReg32XM0RegClass);
Register MaskReg = MRI.createVirtualRegister(SReg32XM0RegClass);

bool IsWave32 = ST.isWave32();
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

// Create initial values of induction variable from Exec, Accumulator and
// insert branch instr to newly created ComputeBlock
uint32_t InitalValue =
uint32_t IdentityValue =
(Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
auto TmpSReg =
BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
.addImm(InitalValue);
// clang-format off

BuildMI(BB, I, DL, TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), ExecCopyReg).addReg(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC); //%19:sreg_64_xexec = S_MOV_B64 $exec

BuildMI(BB, I, DL, TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), ExecCopyReg1).addReg(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC); //%19:sreg_64_xexec = S_MOV_B64 $exec

BuildMI(BB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), AccReg)
.addImm(IdentityValue);// %24:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
BuildMI(BB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), InitialBPermAddrReg)
.addImm(0);
BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
.addMBB(ComputeLoop);
// clang-format on

// Start constructing ComputeLoop
I = ComputeLoop->end();
auto Accumulator =
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
.addReg(InitalValReg)
.addMBB(&BB);
auto ActiveBits =
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
.addReg(TmpSReg->getOperand(0).getReg())
.addMBB(&BB);

auto PhiActiveLanesInst =
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveLanesReg)
.addReg(ExecCopyReg)
.addMBB(&BB);// %25:sreg_64_xexec = PHI %19:sreg_64_xexec, %bb.0, %26:sreg_64_xexec, %bb.1
auto PhiAccInst =
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccReg1)
.addReg(AccReg)
.addMBB(&BB);//%23:vgpr_32 = PHI %24:vgpr_32, %bb.0, %22:vgpr_32, %bb.1
auto PhiBPermAddrInst =
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), BPermAddrReg)
.addReg(InitialBPermAddrReg)
.addMBB(&BB);//%23:vgpr_32 = PHI %24:vgpr_32, %bb.0, %22:vgpr_32, %bb.1
// Perform the computations
unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
.addReg(ActiveBits->getOperand(0).getReg());
auto LaneValue = BuildMI(*ComputeLoop, I, DL,
TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
BuildMI(*ComputeLoop, I, DL, TII->get(IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64), FF1ActiveLanesReg)
.addReg(ActiveLanesReg);//%27:sreg_32 = S_FF1_I32_B64 %25:sreg_64_xexec

BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), ValReg)
.addReg(SrcReg)
.addReg(FF1->getOperand(0).getReg());
auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
.addReg(Accumulator->getOperand(0).getReg())
.addReg(LaneValue->getOperand(0).getReg());
.addReg(FF1ActiveLanesReg);//%29:sreg_32_xm0 = V_READLANE_B32 %10:vgpr_32, %27:sreg_32

BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), MaskReg)
.addReg(DivergentMaskReg)
.addReg(FF1ActiveLanesReg);

BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_FF1_I32_B32), FF1MaskReg).addReg(MaskReg);

BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), AccSGPRReg)
.addReg(AccReg1)
.addReg(FF1MaskReg);

BuildMI(*ComputeLoop, I, DL, TII->get(Opc), UpdatedAccSGPRReg).addReg(AccSGPRReg).addReg(ValReg);

BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
.addReg(FF1MaskReg);

BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_WRITELANE_B32), UpdatedAccReg)
.addReg(UpdatedAccSGPRReg)
.addReg(AMDGPU::M0)
.addReg(AccReg1);

BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_LSHL_B32), FF1MaskX4Reg)
.addReg(FF1MaskReg)
.addImm(2);

BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
.addReg(FF1ActiveLanesReg);

BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_WRITELANE_B32), UpdatedBPermAddrReg)
.addReg(FF1MaskX4Reg)
.addReg(AMDGPU::M0)
.addReg(BPermAddrReg);

// Manipulate the iterator to get the next active lane
unsigned BITSETOpc =
IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
auto NewActiveBits =
BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
.addReg(FF1->getOperand(0).getReg())
.addReg(ActiveBits->getOperand(0).getReg());
BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), UpdatedActiveLanesReg)
.addReg(FF1ActiveLanesReg)
.addReg(ActiveLanesReg);

// Add phi nodes
Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
PhiActiveLanesInst.addReg(UpdatedActiveLanesReg)
.addMBB(ComputeLoop);
ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
PhiAccInst.addReg(UpdatedAccReg)
.addMBB(ComputeLoop);
PhiBPermAddrInst.addReg(UpdatedBPermAddrReg)
.addMBB(ComputeLoop);

// Creating branching
unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
.addReg(NewActiveBits->getOperand(0).getReg())
.addReg(UpdatedActiveLanesReg)
.addImm(0);
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
.addMBB(ComputeLoop);

BuildMI(*ComputeEnd, ComputeEnd->begin(), DL, TII->get(AMDGPU::DS_BPERMUTE_B32), DstReg)
.addReg(UpdatedBPermAddrReg)
.addReg(UpdatedAccReg)
.addImm(0);

RetBB = ComputeEnd;

}
MI.eraseFromParent();
return RetBB;
Expand Down
12 changes: 6 additions & 6 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -298,14 +298,14 @@ def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
(V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;

let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$strategy))]> {
def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs VGPR_32:$vdst),
(ins VSrc_b32: $src, VSrc_b32: $mask, VSrc_b32:$strategy),
[(set i32:$vdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$mask, i32:$strategy))]> {
}

def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> {
def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs VGPR_32:$vdst),
(ins VSrc_b32: $src, VSrc_b32: $mask, VSrc_b32:$strategy),
[(set i32:$vdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$mask, i32:$strategy))]> {
}
}

Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s

declare i32 @llvm.amdgcn.wave.reduce.umax.i32(i32, i32 immarg)
declare i32 @llvm.amdgcn.wave.reduce.umax.i32(i32, i32, i32 immarg)
declare i32 @llvm.amdgcn.workitem.id.x()

define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
Expand Down Expand Up @@ -122,12 +122,12 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX1132GISEL-NEXT: s_endpgm
entry:
%result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %in, i32 1)
%result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %in, i32 15, i32 1)
store i32 %result, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
define amdgpu_kernel void @const_value(ptr addrspace(1) %out, i32 %in) {
; GFX8DAGISEL-LABEL: const_value:
; GFX8DAGISEL: ; %bb.0: ; %entry
; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
Expand Down Expand Up @@ -218,7 +218,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX1132GISEL-NEXT: s_endpgm
entry:
%result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 123, i32 1)
%result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 123, i32 %in, i32 1)
store i32 %result, ptr addrspace(1) %out
ret void
}
Expand Down Expand Up @@ -256,7 +256,7 @@ define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) {
; GFX11GISEL: ; %bb.0: ; %entry
; GFX11GISEL-NEXT: s_endpgm
entry:
%result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 poison, i32 1)
%result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 poison, i32 poison, i32 1)
store i32 %result, ptr addrspace(1) %out
ret void
}
Expand Down Expand Up @@ -499,7 +499,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-NEXT: s_endpgm
entry:
%id.x = call i32 @llvm.amdgcn.workitem.id.x()
%result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %id.x, i32 1)
%result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %id.x, i32 %in, i32 1)
store i32 %result, ptr addrspace(1) %out
ret void
}
Expand Down Expand Up @@ -937,11 +937,11 @@ entry:
br i1 %d_cmp, label %if, label %else

if:
%reducedValTid = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %tid, i32 1)
%reducedValTid = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %tid, i32 %in, i32 1)
br label %endif

else:
%reducedValIn = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %in, i32 1)
%reducedValIn = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %in, i32 %in, i32 1)
br label %endif

endif:
Expand Down