Skip to content

Commit dbf4525

Browse files
authored
[AMDGPU] Add wave reduce intrinsics for float types - 1 (#161814)
Supported Ops: `fmin`, `fmax`
1 parent 3e5fafd commit dbf4525

File tree

6 files changed

+1869
-7
lines changed

6 files changed

+1869
-7
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2470,7 +2470,7 @@ def int_amdgcn_s_quadmask :
24702470
def int_amdgcn_s_wqm :
24712471
DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem, IntrConvergent]>;
24722472

2473-
class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
2473+
class AMDGPUWaveReduce<LLVMType data_ty = llvm_any_ty> : Intrinsic<
24742474
[data_ty],
24752475
[
24762476
LLVMMatchType<0>, // llvm value to reduce (SGPR/VGPR)
@@ -2482,7 +2482,7 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
24822482

24832483
multiclass AMDGPUWaveReduceOps {
24842484
foreach Op =
2485-
["umin", "min", "umax", "max", "add", "sub", "and", "or", "xor"] in {
2485+
["umin", "fmin", "min", "umax", "fmax", "max", "add", "sub", "and", "or", "xor"] in {
24862486
def Op : AMDGPUWaveReduce;
24872487
}
24882488
}

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5217,8 +5217,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
52175217
case Intrinsic::amdgcn_wave_reduce_sub:
52185218
case Intrinsic::amdgcn_wave_reduce_min:
52195219
case Intrinsic::amdgcn_wave_reduce_umin:
5220+
case Intrinsic::amdgcn_wave_reduce_fmin:
52205221
case Intrinsic::amdgcn_wave_reduce_max:
52215222
case Intrinsic::amdgcn_wave_reduce_umax:
5223+
case Intrinsic::amdgcn_wave_reduce_fmax:
52225224
case Intrinsic::amdgcn_wave_reduce_and:
52235225
case Intrinsic::amdgcn_wave_reduce_or:
52245226
case Intrinsic::amdgcn_wave_reduce_xor: {

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 39 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5487,6 +5487,9 @@ static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
54875487
return std::numeric_limits<uint32_t>::min();
54885488
case AMDGPU::S_AND_B32:
54895489
return std::numeric_limits<uint32_t>::max();
5490+
case AMDGPU::V_MIN_F32_e64:
5491+
case AMDGPU::V_MAX_F32_e64:
5492+
return 0x7fc00000; // qNAN
54905493
default:
54915494
llvm_unreachable(
54925495
"Unexpected opcode in getIdentityValueFor32BitWaveReduction");
@@ -5521,7 +5524,12 @@ static bool is32bitWaveReduceOperation(unsigned Opc) {
55215524
Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
55225525
Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
55235526
Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
5524-
Opc == AMDGPU::S_XOR_B32;
5527+
Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
5528+
Opc == AMDGPU::V_MAX_F32_e64;
5529+
}
5530+
5531+
/// Returns true if \p Opc is one of the floating-point opcodes handled by the
/// wave-reduce lowering, i.e. V_MIN_F32_e64 or V_MAX_F32_e64 (the opcodes
/// passed to lowerWaveReduce for the FMIN/FMAX F32 pseudos); false otherwise.
static bool isFloatingPointWaveReduceOperation(unsigned Opc) {
5532+
return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64;
55255533
}
55265534

55275535
static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
@@ -5542,8 +5550,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
55425550
switch (Opc) {
55435551
case AMDGPU::S_MIN_U32:
55445552
case AMDGPU::S_MIN_I32:
5553+
case AMDGPU::V_MIN_F32_e64:
55455554
case AMDGPU::S_MAX_U32:
55465555
case AMDGPU::S_MAX_I32:
5556+
case AMDGPU::V_MAX_F32_e64:
55475557
case AMDGPU::S_AND_B32:
55485558
case AMDGPU::S_OR_B32: {
55495559
// Idempotent operations.
@@ -5739,6 +5749,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
57395749
MachineBasicBlock::iterator I = BB.end();
57405750
Register SrcReg = MI.getOperand(1).getReg();
57415751
bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5752+
bool isFPOp = isFloatingPointWaveReduceOperation(Opc);
57425753

57435754
// Create Control flow for loop
57445755
// Split MI's Machine Basic block into For loop
@@ -5798,9 +5809,29 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
57985809
LaneValueReg)
57995810
.addReg(SrcReg)
58005811
.addReg(FF1Reg);
5801-
NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5802-
.addReg(Accumulator->getOperand(0).getReg())
5803-
.addReg(LaneValueReg);
5812+
if (isFPOp) {
5813+
Register LaneValVreg =
5814+
MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
5815+
Register DstVreg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
5816+
// Copy the lane value into a VGPR to avoid violating the constant bus restriction.
5817+
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B32_e32),
5818+
LaneValVreg)
5819+
.addReg(LaneValueReg);
5820+
BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
5821+
.addImm(0) // src0 modifier
5822+
.addReg(Accumulator->getOperand(0).getReg())
5823+
.addImm(0) // src1 modifier
5824+
.addReg(LaneValVreg)
5825+
.addImm(0) // clamp
5826+
.addImm(0); // omod
5827+
NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5828+
TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
5829+
.addReg(DstVreg);
5830+
} else {
5831+
NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5832+
.addReg(Accumulator->getOperand(0).getReg())
5833+
.addReg(LaneValueReg);
5834+
}
58045835
} else {
58055836
Register LaneValueLoReg =
58065837
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
@@ -5932,6 +5963,8 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
59325963
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
59335964
case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
59345965
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
5966+
case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
5967+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MIN_F32_e64);
59355968
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
59365969
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
59375970
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
@@ -5940,6 +5973,8 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
59405973
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
59415974
case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
59425975
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
5976+
case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
5977+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MAX_F32_e64);
59435978
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
59445979
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
59455980
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -348,7 +348,7 @@ class WaveReduceOp<string OpName, string TypeStr, ValueType Ty,
348348

349349
// Input list : [Operation_name,
350350
// type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B),
351-
// bit-width
351+
// input-type,
352352
// output register class,
353353
// input register class]
354354
defvar Operations = [
@@ -371,6 +371,9 @@ defvar Operations = [
371371
WaveReduceOp<"and", "B64", i64, SGPR_64, VSrc_b64>,
372372
WaveReduceOp<"or", "B64", i64, SGPR_64, VSrc_b64>,
373373
WaveReduceOp<"xor", "B64", i64, SGPR_64, VSrc_b64>,
374+
375+
WaveReduceOp<"fmin", "F32", f32, SGPR_32, VSrc_b32>,
376+
WaveReduceOp<"fmax", "F32", f32, SGPR_32, VSrc_b32>,
374377
];
375378

376379
foreach Op = Operations in {

0 commit comments

Comments
 (0)